# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""A collection of statistical utility functions to be used by metrics."""

import math


def Clamp(value, low=0.0, high=1.0):
  """Clamp a value between some low and high value."""
  return min(max(value, low), high)


def NormalizeSamples(samples):
  """Sorts the samples, and maps them linearly to the range [0,1].

  They're mapped such that for the N samples, the first sample is 0.5/N and the
  last sample is (N-0.5)/N.

  Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N,
  twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case
  we don't want to distinguish between these two cases, as our original domain
  is not bounded (it is for Monte Carlo integration, where discrepancy was
  first used).

  Args:
    samples: A list of numerical samples.

  Returns:
    A (normalized_samples, scale) tuple, where |scale| is the slope of the
    linear mapping applied to the samples. An empty list is returned as-is
    with scale 1.0; a constant list maps every sample to 0.5.
  """
  if not samples:
    return samples, 1.0
  samples = sorted(samples)
  low = min(samples)
  high = max(samples)
  new_low = 0.5 / len(samples)
  new_high = (len(samples) - 0.5) / len(samples)
  if high - low == 0.0:
    # All samples are identical; map each one to the midpoint of [0,1].
    return [0.5] * len(samples), 1.0
  scale = (new_high - new_low) / (high - low)
  samples = [float(s - low) * scale + new_low for s in samples]
  return samples, scale


def Discrepancy(samples, location_count=None):
  """Computes the discrepancy of a set of 1D samples from the interval [0,1].

  The samples must be sorted. We define the discrepancy of an empty set
  of samples to be zero.

  http://en.wikipedia.org/wiki/Low-discrepancy_sequence
  http://mathworld.wolfram.com/Discrepancy.html

  Args:
    samples: A sorted list of floats in [0,1].
    location_count: If given, the discrepancy is evaluated only at this many
        equally spaced locations instead of at the sample positions themselves
        (a cheaper approximation).

  Returns:
    The maximum local discrepancy as a float.
  """
  if not samples:
    return 0.0

  max_local_discrepancy = 0
  inv_sample_count = 1.0 / len(samples)
  locations = []
  # For each location, stores the number of samples less than that location.
  count_less = []
  # For each location, stores the number of samples less than or equal to that
  # location.
  count_less_equal = []

  if location_count:
    # Generate list of equally spaced locations.
    sample_index = 0
    for i in range(int(location_count)):
      location = float(i) / (location_count - 1)
      locations.append(location)
      while sample_index < len(samples) and samples[sample_index] < location:
        sample_index += 1
      count_less.append(sample_index)
      while sample_index < len(samples) and samples[sample_index] <= location:
        sample_index += 1
      count_less_equal.append(sample_index)
  else:
    # Populate locations with sample positions. Append 0 and 1 if necessary.
    if samples[0] > 0.0:
      locations.append(0.0)
      count_less.append(0)
      count_less_equal.append(0)
    for i, sample in enumerate(samples):
      locations.append(sample)
      count_less.append(i)
      count_less_equal.append(i + 1)
    if samples[-1] < 1.0:
      locations.append(1.0)
      count_less.append(len(samples))
      count_less_equal.append(len(samples))

  # Compute discrepancy as max(overshoot, -undershoot), where
  # overshoot = max(count_closed(i, j)/N - length(i, j)) for all i < j,
  # undershoot = min(count_open(i, j)/N - length(i, j)) for all i < j,
  # N = len(samples),
  # count_closed(i, j) is the number of points between i and j including ends,
  # count_open(i, j) is the number of points between i and j excluding ends,
  # length(i, j) is locations[j] - locations[i].

  # The following algorithm is a modification of Kadane's algorithm,
  # see https://en.wikipedia.org/wiki/Maximum_subarray_problem.

  # The maximum of (count_closed(k, i-1)/N - length(k, i-1)) for any k < i-1.
  max_diff = 0
  # The minimum of (count_open(k, i-1)/N - length(k, i-1)) for any k < i-1.
  min_diff = 0
  for i in range(1, len(locations)):
    length = locations[i] - locations[i - 1]
    count_closed = count_less_equal[i] - count_less[i - 1]
    count_open = count_less[i] - count_less_equal[i - 1]
    # Number of points that are added if we extend a closed range that
    # ends at location (i-1).
    count_closed_increment = count_less_equal[i] - count_less_equal[i - 1]
    # Number of points that are added if we extend an open range that
    # ends at location (i-1).
    count_open_increment = count_less[i] - count_less[i - 1]

    # Either extend the previous optimal range or start a new one.
    max_diff = max(
        float(count_closed_increment) * inv_sample_count - length + max_diff,
        float(count_closed) * inv_sample_count - length)
    min_diff = min(
        float(count_open_increment) * inv_sample_count - length + min_diff,
        float(count_open) * inv_sample_count - length)

    max_local_discrepancy = max(max_diff, -min_diff, max_local_discrepancy)
  return max_local_discrepancy


def TimestampsDiscrepancy(timestamps, absolute=True,
                          location_count=None):
  """A discrepancy based metric for measuring timestamp jank.

  TimestampsDiscrepancy quantifies the largest area of jank observed in a
  series of timestamps. Note that this is different from metrics based on the
  max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and
  B = [0,1,2,3,5,7] have the same max_time_interval = 2, but
  Discrepancy(B) > Discrepancy(A).

  Two variants of discrepancy can be computed:

  Relative discrepancy is following the original definition of
  discrepancy. It characterizes the largest area of jank, relative to the
  duration of the entire time stamp series. We normalize the raw results,
  because the best case discrepancy for a set of N samples is 1/N (for
  equally spaced samples), and we want our metric to report 0.0 in that
  case.

  Absolute discrepancy also characterizes the largest area of jank, but its
  value wouldn't change (except for imprecisions due to a low
  |location_count|) if additional 'good' intervals were added to an
  existing list of time stamps. Its range is [0,inf] and the unit is
  milliseconds.

  The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same
  absolute discrepancy, but D has lower relative discrepancy than C.

  |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each
  S_i is a time stamp series. In that case, the discrepancy D(S) is:
  D(S) = max(D(S_1), D(S_2), ..., D(S_N))

  Args:
    timestamps: A list of timestamps, or a list of timestamp lists.
    absolute: If True, return absolute discrepancy; else relative.
    location_count: See Discrepancy.
  """
  if not timestamps:
    return 0.0

  if isinstance(timestamps[0], list):
    # A list of series: the overall discrepancy is that of the worst series.
    range_discrepancies = [TimestampsDiscrepancy(r) for r in timestamps]
    return max(range_discrepancies)

  samples, sample_scale = NormalizeSamples(timestamps)
  discrepancy = Discrepancy(samples, location_count)
  inv_sample_count = 1.0 / len(samples)
  if absolute:
    # Compute absolute discrepancy
    discrepancy /= sample_scale
  else:
    # Compute relative discrepancy
    discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count))
  return discrepancy


def DurationsDiscrepancy(durations, absolute=True,
                         location_count=None):
  """A discrepancy based metric for measuring duration jank.

  DurationsDiscrepancy computes a jank metric which measures how irregular a
  given sequence of intervals is. In order to minimize jank, each duration
  should be equally long. This is similar to how timestamp jank works,
  and we therefore reuse the timestamp discrepancy function above to compute a
  similar duration discrepancy number.

  Because timestamp discrepancy is defined in terms of timestamps, we first
  convert the list of durations to monotonically increasing timestamps.

  Args:
    durations: List of interval lengths in milliseconds.
    absolute: See TimestampsDiscrepancy.
    location_count: See TimestampsDiscrepancy.
  """
  if not durations:
    return 0.0

  # Prefix-sum the durations into a timestamp series starting at 0.
  timestamps = [0]
  for duration in durations:
    timestamps.append(timestamps[-1] + duration)
  return TimestampsDiscrepancy(timestamps, absolute, location_count)


def ArithmeticMean(data):
  """Calculates arithmetic mean.

  Args:
    data: A list of samples.

  Returns:
    The arithmetic mean value, or 0 if the list is empty.
  """
  numerator_total = Total(data)
  return DivideIfPossibleOrZero(numerator_total, len(data))


def StandardDeviation(data):
  """Calculates the (population) standard deviation.

  Args:
    data: A list of samples.

  Returns:
    The standard deviation of the samples provided.
  """
  if len(data) == 1:
    return 0.0

  mean = ArithmeticMean(data)
  variances = [(float(x) - mean) ** 2 for x in data]
  return math.sqrt(ArithmeticMean(variances))


def TrapezoidalRule(data, dx):
  """Calculate the integral according to the trapezoidal rule.

  TrapezoidalRule approximates the definite integral of f from a to b by
  the composite trapezoidal rule, using n subintervals.
  http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid

  Args:
    data: A list of samples.
    dx: The uniform distance along the x axis between any two samples.

  Returns:
    The area under the curve defined by the samples and the uniform distance
    according to the trapezoidal rule, or 0.0 for fewer than two samples.
  """
  n = len(data) - 1
  if n <= 0:
    # Fewer than two samples bound no area. Checking before indexing also
    # avoids an IndexError on an empty list.
    return 0.0

  s = data[0] + data[n]
  for i in range(1, n):
    s += 2 * data[i]
  return s * dx / 2.0


def Total(data):
  """Returns the float value of a number or the sum of a list.

  Args:
    data: A float, an int, or a list of numbers.

  Raises:
    TypeError: If |data| is of any other type.
  """
  if isinstance(data, float):
    return data
  if isinstance(data, int):
    return float(data)
  if isinstance(data, list):
    return float(sum(data))
  raise TypeError('Expected float, int or list, got %s' % type(data).__name__)


def DivideIfPossibleOrZero(numerator, denominator):
  """Returns the quotient, or zero if the denominator is zero."""
  return (float(numerator) / float(denominator)) if denominator else 0


def GeneralizedMean(values, exponent):
  """See http://en.wikipedia.org/wiki/Generalized_mean"""
  if not values:
    return 0.0
  sum_of_powers = 0.0
  for v in values:
    sum_of_powers += v ** exponent
  return (sum_of_powers / len(values)) ** (1.0 / exponent)


def Median(values):
  """Gets the median of a list of values."""
  return Percentile(values, 50)


def Percentile(values, percentile):
  """Calculates the value below which a given percentage of values fall.

  For example, if 17% of the values are less than 5.0, then 5.0 is the 17th
  percentile for this set of values. When the percentage doesn't exactly
  match a rank in the list of values, the percentile is computed using linear
  interpolation between closest ranks.

  Args:
    values: A list of numerical values.
    percentile: A number between 0 and 100.

  Returns:
    The Nth percentile for the list of values, where N is the given percentage.
  """
  if not values:
    return 0.0
  sorted_values = sorted(values)
  n = len(values)
  fraction = percentile / 100.0
  if fraction <= 0.5 / n:
    return sorted_values[0]
  elif fraction >= (n - 0.5) / n:
    return sorted_values[-1]
  else:
    # Linearly interpolate between the two closest ranks.
    floor_index = int(math.floor(n * fraction - 0.5))
    floor_value = sorted_values[floor_index]
    ceil_value = sorted_values[floor_index + 1]
    alpha = n * fraction - 0.5 - floor_index
    return floor_value + alpha * (ceil_value - floor_value)


def GeometricMean(values):
  """Compute a rounded geometric mean from an array of values."""
  if not values:
    return None
  # To avoid infinite value errors, make sure no value is less than 0.001.
  new_values = [max(value, 0.001) for value in values]
  # The geometric mean is e raised to the mean of the logs of the values.
  log_sum = sum(map(math.log, new_values))
  mean = math.exp(log_sum / len(new_values))
  # Return the rounded mean.
  return int(round(mean))