1# Copyright 2014 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""A collection of statistical utility functions to be used by metrics."""
6
7import math
8
9
def Clamp(value, low=0.0, high=1.0):
  """Clamps a value into the range [low, high].

  Matches min(max(value, low), high): the lower bound is applied first,
  then the upper bound.
  """
  floored = max(value, low)
  return min(floored, high)
13
14
def NormalizeSamples(samples):
  """Sorts the samples, and maps them linearly to the range [0,1].

  They're mapped such that for the N samples, the first sample is 0.5/N and the
  last sample is (N-0.5)/N.

  Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N,
  twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case
  we don't want to distinguish between these two cases, as our original domain
  is not bounded (it is for Monte Carlo integration, where discrepancy was
  first used).

  Args:
    samples: A list of numeric values.

  Returns:
    A tuple (normalized_samples, scale), where scale is the linear scaling
    factor that was applied (1.0 for empty or constant input).
  """
  if not samples:
    return samples, 1.0
  samples = sorted(samples)
  count = len(samples)
  # The list is sorted, so the extremes are at its ends.
  low, high = samples[0], samples[-1]
  new_low = 0.5 / count
  new_high = (count - 0.5) / count
  if high - low == 0.0:
    # All samples are identical; map them all to the midpoint of [0,1].
    return [0.5] * count, 1.0
  scale = (new_high - new_low) / (high - low)
  # NOTE: the original used a Python-2-only xrange loop here.
  samples = [float(s - low) * scale + new_low for s in samples]
  return samples, scale
40
41
def Discrepancy(samples, location_count=None):
  """Computes the discrepancy of a set of 1D samples from the interval [0,1].

  The samples must be sorted. We define the discrepancy of an empty set
  of samples to be zero.

  http://en.wikipedia.org/wiki/Low-discrepancy_sequence
  http://mathworld.wolfram.com/Discrepancy.html

  Args:
    samples: A sorted list of floats in [0,1].
    location_count: If given, evaluate the discrepancy at this many equally
        spaced locations in [0,1] (must be >= 2); otherwise the sample
        positions themselves are used as locations.

  Returns:
    The discrepancy as a float.
  """
  if not samples:
    return 0.0

  max_local_discrepancy = 0
  inv_sample_count = 1.0 / len(samples)
  locations = []
  # For each location, stores the number of samples less than that location.
  count_less = []
  # For each location, stores the number of samples less than or equal to that
  # location.
  count_less_equal = []

  if location_count:
    # Generate list of equally spaced locations.
    sample_index = 0
    # NOTE: xrange (Python 2 only) replaced with range.
    for i in range(int(location_count)):
      location = float(i) / (location_count - 1)
      locations.append(location)
      while sample_index < len(samples) and samples[sample_index] < location:
        sample_index += 1
      count_less.append(sample_index)
      while sample_index < len(samples) and samples[sample_index] <= location:
        sample_index += 1
      count_less_equal.append(sample_index)
  else:
    # Populate locations with sample positions. Append 0 and 1 if necessary.
    if samples[0] > 0.0:
      locations.append(0.0)
      count_less.append(0)
      count_less_equal.append(0)
    for i in range(len(samples)):
      locations.append(samples[i])
      count_less.append(i)
      count_less_equal.append(i + 1)
    if samples[-1] < 1.0:
      locations.append(1.0)
      count_less.append(len(samples))
      count_less_equal.append(len(samples))

  # Compute discrepancy as max(overshoot, -undershoot), where
  # overshoot = max(count_closed(i, j)/N - length(i, j)) for all i < j,
  # undershoot = min(count_open(i, j)/N - length(i, j)) for all i < j,
  # N = len(samples),
  # count_closed(i, j) is the number of points between i and j including ends,
  # count_open(i, j) is the number of points between i and j excluding ends,
  # length(i, j) is locations[i] - locations[j].

  # The following algorithm is modification of Kadane's algorithm,
  # see https://en.wikipedia.org/wiki/Maximum_subarray_problem.

  # The maximum of (count_closed(k, i-1)/N - length(k, i-1)) for any k < i-1.
  max_diff = 0
  # The minimum of (count_open(k, i-1)/N - length(k, i-1)) for any k < i-1.
  min_diff = 0
  for i in range(1, len(locations)):
    length = locations[i] - locations[i - 1]
    count_closed = count_less_equal[i] - count_less[i - 1]
    count_open = count_less[i] - count_less_equal[i - 1]
    # Number of points that are added if we extend a closed range that
    # ends at location (i-1).
    count_closed_increment = count_less_equal[i] - count_less_equal[i - 1]
    # Number of points that are added if we extend an open range that
    # ends at location (i-1).
    count_open_increment = count_less[i] - count_less[i - 1]

    # Either extend the previous optimal range or start a new one.
    max_diff = max(
        float(count_closed_increment) * inv_sample_count - length + max_diff,
        float(count_closed) * inv_sample_count - length)
    min_diff = min(
        float(count_open_increment) * inv_sample_count - length + min_diff,
        float(count_open) * inv_sample_count - length)

    max_local_discrepancy = max(max_diff, -min_diff, max_local_discrepancy)
  return max_local_discrepancy
126
127
def TimestampsDiscrepancy(timestamps, absolute=True,
                          location_count=None):
  """A discrepancy based metric for measuring timestamp jank.

  TimestampsDiscrepancy quantifies the largest area of jank observed in a series
  of timestamps.  Note that this is different from metrics based on the
  max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and
  B = [0,1,2,3,5,7] have the same max_time_interval = 2, but
  Discrepancy(B) > Discrepancy(A).

  Two variants of discrepancy can be computed:

  Relative discrepancy is following the original definition of
  discrepancy. It characterized the largest area of jank, relative to the
  duration of the entire time stamp series.  We normalize the raw results,
  because the best case discrepancy for a set of N samples is 1/N (for
  equally spaced samples), and we want our metric to report 0.0 in that
  case.

  Absolute discrepancy also characterizes the largest area of jank, but its
  value wouldn't change (except for imprecisions due to a low
  |location_count|) if additional 'good' intervals were added to an
  exisiting list of time stamps.  Its range is [0,inf] and the unit is
  milliseconds.

  The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same
  absolute discrepancy, but D has lower relative discrepancy than C.

  |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each
  S_i is a time stamp series. In that case, the discrepancy D(S) is:
  D(S) = max(D(S_1), D(S_2), ..., D(S_N))

  Args:
    timestamps: A list of timestamps, or a list of timestamp lists.
    absolute: Whether to compute absolute (True) or relative discrepancy.
    location_count: See Discrepancy.
  """
  if not timestamps:
    return 0.0

  if isinstance(timestamps[0], list):
    # Propagate |absolute| and |location_count| to each sub-series; the
    # previous code silently recomputed them with default arguments.
    range_discrepancies = [TimestampsDiscrepancy(r, absolute, location_count)
                           for r in timestamps]
    return max(range_discrepancies)

  samples, sample_scale = NormalizeSamples(timestamps)
  discrepancy = Discrepancy(samples, location_count)
  inv_sample_count = 1.0 / len(samples)
  if absolute:
    # Compute absolute discrepancy
    discrepancy /= sample_scale
  else:
    # Compute relative discrepancy
    discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count))
  return discrepancy
177
178
def DurationsDiscrepancy(durations, absolute=True,
                         location_count=None):
  """A discrepancy based metric for measuring duration jank.

  DurationsDiscrepancy computes a jank metric which measures how irregular a
  given sequence of intervals is. In order to minimize jank, each duration
  should be equally long. This is similar to how timestamp jank works,
  and we therefore reuse the timestamp discrepancy function above to compute a
  similar duration discrepancy number.

  Because timestamp discrepancy is defined in terms of timestamps, we first
  convert the list of durations to monotonically increasing timestamps.

  Args:
    durations: List of interval lengths in milliseconds.
    absolute: See TimestampsDiscrepancy.
    location_count: See TimestampsDiscrepancy.
  """
  if not durations:
    return 0.0

  # Convert durations into cumulative timestamps starting at 0.
  # (The original used bare `reduce`, which is not a builtin in Python 3.)
  timestamps = [0]
  for duration in durations:
    timestamps.append(timestamps[-1] + duration)
  return TimestampsDiscrepancy(timestamps, absolute, location_count)
202
203
def ArithmeticMean(data):
  """Calculates arithmetic mean.

  Args:
    data: A list of samples.

  Returns:
    The arithmetic mean value, or 0 if the list is empty.
  """
  total = Total(data)
  count = Total(len(data))
  # DivideIfPossibleOrZero guards against the empty-list (count == 0) case.
  return DivideIfPossibleOrZero(total, count)
216
217
def StandardDeviation(data):
  """Calculates the (population) standard deviation.

  Args:
    data: A list of samples.

  Returns:
    The standard deviation of the samples provided.
  """
  if len(data) == 1:
    return 0.0

  mean = ArithmeticMean(data)
  deviations = [float(x) - mean for x in data]
  squared = [d * d for d in deviations]
  return math.sqrt(ArithmeticMean(squared))
236
237
def TrapezoidalRule(data, dx):
  """Calculates the integral according to the trapezoidal rule.

  TrapezoidalRule approximates the definite integral of f from a to b by
  the composite trapezoidal rule, using n subintervals.
  http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid

  Args:
    data: A list of samples
    dx: The uniform distance along the x axis between any two samples

  Returns:
    The area under the curve defined by the samples and the uniform distance
    according to the trapezoidal rule, or 0.0 for fewer than two samples.
  """
  n = len(data) - 1

  # Fewer than two samples form no trapezoid. Check BEFORE indexing: the
  # original computed data[0] + data[n] first, raising IndexError on [].
  if n <= 0:
    return 0.0

  s = data[0] + data[n]
  for i in range(1, n):
    s += 2 * data[i]

  return s * dx / 2.0
264
def Total(data):
  """Returns the float value of a number or the sum of a list.

  Args:
    data: An int, a float, or a list of numbers.

  Returns:
    The value as a float (sum of elements for a list).

  Raises:
    TypeError: If data is any other type, including bool (which the
        original `type(...) ==` checks also rejected).
  """
  # bool is a subclass of int; reject it explicitly to keep the behavior of
  # the original exact-type checks.
  if isinstance(data, bool):
    raise TypeError('Total() does not accept booleans')
  if isinstance(data, float):
    return data
  if isinstance(data, int):
    return float(data)
  if isinstance(data, list):
    return float(sum(data))
  raise TypeError('Total() accepts an int, float, or list')
276
277
def DivideIfPossibleOrZero(numerator, denominator):
  """Returns the quotient, or zero if the denominator is zero."""
  if not denominator:
    return 0
  return float(numerator) / float(denominator)
281
282
def GeneralizedMean(values, exponent):
  """See http://en.wikipedia.org/wiki/Generalized_mean"""
  if not values:
    return 0.0
  # Start the sum at 0.0 so accumulation stays in float, matching the
  # original running-total loop.
  power_sum = sum((v ** exponent for v in values), 0.0)
  return (power_sum / len(values)) ** (1.0 / exponent)
291
292
def Median(values):
  """Gets the median of a list of values."""
  # The median is by definition the 50th percentile.
  median_percentile = 50
  return Percentile(values, median_percentile)
296
297
def Percentile(values, percentile):
  """Calculates the value below which a given percentage of values fall.

  For example, if 17% of the values are less than 5.0, then 5.0 is the 17th
  percentile for this set of values. When the percentage doesn't exactly
  match a rank in the list of values, the percentile is computed using linear
  interpolation between closest ranks.

  Args:
    values: A list of numerical values.
    percentile: A number between 0 and 100.

  Returns:
    The Nth percentile for the list of values, where N is the given percentage.
  """
  if not values:
    return 0.0
  ordered = sorted(values)
  n = len(ordered)
  fraction = percentile / 100.0
  # Clamp to the extremes outside the interpolable range.
  if fraction <= 0.5 / n:
    return ordered[0]
  if fraction >= (n - 0.5) / n:
    return ordered[-1]
  # Linear interpolation between the two closest ranks.
  rank = n * fraction - 0.5
  lower_index = int(math.floor(rank))
  weight = rank - lower_index
  lower_value = ordered[lower_index]
  upper_value = ordered[lower_index + 1]
  return lower_value + weight * (upper_value - lower_value)
328
329
def GeometricMean(values):
  """Compute a rounded geometric mean from an array of values.

  Returns None for an empty input. Values at or below 0.001 are clamped to
  0.001 to avoid math-domain errors from log of non-positive numbers.
  """
  if not values:
    return None
  clamped = [max(value, 0.001) for value in values]
  # Geometric mean = exp(arithmetic mean of the logs).
  log_total = sum(map(math.log, clamped))
  mean = math.exp(log_total / len(clamped))
  return int(round(mean))
347