1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <math.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
29 #include "igt_core.h"
30 #include "igt_stats.h"
31 
32 #define U64_MAX         ((uint64_t)~0ULL)
33 
34 #define sorted_value(stats, i) (stats->is_float ? stats->sorted_f[i] : stats->sorted_u64[i])
35 #define unsorted_value(stats, i) (stats->is_float ? stats->values_f[i] : stats->values_u64[i])
36 
37 /**
38  * SECTION:igt_stats
39  * @short_description: Tools for statistical analysis
40  * @title: Stats
41  * @include: igt.h
42  *
43  * Various tools to make sense of data.
44  *
45  * #igt_stats_t is a container of data samples. igt_stats_push() is used to add
46  * new samples and various results (mean, variance, standard deviation, ...)
47  * can then be retrieved.
48  *
49  * |[
50  *	igt_stats_t stats;
51  *
52  *	igt_stats_init(&stats, 8);
53  *
54  *	igt_stats_push(&stats, 2);
55  *	igt_stats_push(&stats, 4);
56  *	igt_stats_push(&stats, 4);
57  *	igt_stats_push(&stats, 4);
58  *	igt_stats_push(&stats, 5);
59  *	igt_stats_push(&stats, 5);
60  *	igt_stats_push(&stats, 7);
61  *	igt_stats_push(&stats, 9);
62  *
63  *	printf("Mean: %lf\n", igt_stats_get_mean(&stats));
64  *
65  *	igt_stats_fini(&stats);
66  * ]|
67  */
68 
get_new_capacity(int need)69 static unsigned int get_new_capacity(int need)
70 {
71 	unsigned int new_capacity;
72 
73 	/* taken from Python's list */
74 	new_capacity = (need >> 6) + (need < 9 ? 3 : 6);
75 	new_capacity += need;
76 
77 	return new_capacity;
78 }
79 
igt_stats_ensure_capacity(igt_stats_t * stats,unsigned int n_additional_values)80 static void igt_stats_ensure_capacity(igt_stats_t *stats,
81 				      unsigned int n_additional_values)
82 {
83 	unsigned int new_n_values = stats->n_values + n_additional_values;
84 	unsigned int new_capacity;
85 
86 	if (new_n_values <= stats->capacity)
87 		return;
88 
89 	new_capacity = get_new_capacity(new_n_values);
90 	stats->values_u64 = realloc(stats->values_u64,
91 				    sizeof(*stats->values_u64) * new_capacity);
92 	igt_assert(stats->values_u64);
93 
94 	stats->capacity = new_capacity;
95 
96 	free(stats->sorted_u64);
97 	stats->sorted_u64 = NULL;
98 }
99 
100 /**
101  * igt_stats_init:
102  * @stats: An #igt_stats_t instance
103  *
104  * Initializes an #igt_stats_t instance. igt_stats_fini() must be called once
105  * finished with @stats.
106  */
igt_stats_init(igt_stats_t * stats)107 void igt_stats_init(igt_stats_t *stats)
108 {
109 	memset(stats, 0, sizeof(*stats));
110 
111 	igt_stats_ensure_capacity(stats, 128);
112 
113 	stats->min = U64_MAX;
114 	stats->max = 0;
115 }
116 
117 /**
118  * igt_stats_init_with_size:
119  * @stats: An #igt_stats_t instance
120  * @capacity: Number of data samples @stats can contain
121  *
122  * Like igt_stats_init() but with a size to avoid reallocating the underlying
123  * array(s) when pushing new values. Useful if we have a good idea of the
124  * number of data points we want @stats to hold.
125  *
126  * igt_stats_fini() must be called once finished with @stats.
127  */
igt_stats_init_with_size(igt_stats_t * stats,unsigned int capacity)128 void igt_stats_init_with_size(igt_stats_t *stats, unsigned int capacity)
129 {
130 	memset(stats, 0, sizeof(*stats));
131 
132 	igt_stats_ensure_capacity(stats, capacity);
133 
134 	stats->min = U64_MAX;
135 	stats->max = 0;
136 	stats->range[0] = HUGE_VAL;
137 	stats->range[1] = -HUGE_VAL;
138 }
139 
140 /**
141  * igt_stats_fini:
142  * @stats: An #igt_stats_t instance
143  *
144  * Frees resources allocated in igt_stats_init().
145  */
igt_stats_fini(igt_stats_t * stats)146 void igt_stats_fini(igt_stats_t *stats)
147 {
148 	free(stats->values_u64);
149 	free(stats->sorted_u64);
150 }
151 
152 
153 /**
154  * igt_stats_is_population:
155  * @stats: An #igt_stats_t instance
156  *
157  * Returns: #true if @stats represents a population, #false if only a sample.
158  *
159  * See igt_stats_set_population() for more details.
160  */
igt_stats_is_population(igt_stats_t * stats)161 bool igt_stats_is_population(igt_stats_t *stats)
162 {
163 	return stats->is_population;
164 }
165 
166 /**
167  * igt_stats_set_population:
168  * @stats: An #igt_stats_t instance
169  * @full_population: Whether we're dealing with sample data or a full
170  *		     population
171  *
172  * In statistics, we usually deal with a subset of the full data (which may be
173  * a continuous or infinite set). Data analysis is then done on a sample of
174  * this population.
175  *
176  * This has some importance as only having a sample of the data leads to
177  * [biased estimators](https://en.wikipedia.org/wiki/Bias_of_an_estimator). We
178  * currently used the information given by this method to apply
179  * [Bessel's correction](https://en.wikipedia.org/wiki/Bessel%27s_correction)
180  * to the variance.
181  *
182  * Note that even if we manage to have an unbiased variance by multiplying
183  * a sample variance by the Bessel's correction, n/(n - 1), the standard
184  * deviation derived from the unbiased variance isn't itself unbiased.
185  * Statisticians talk about a "corrected" standard deviation.
186  *
187  * When giving #true to this function, the data set in @stats is considered a
188  * full population. It's considered a sample of a bigger population otherwise.
189  *
190  * When newly created, @stats defaults to holding sample data.
191  */
igt_stats_set_population(igt_stats_t * stats,bool full_population)192 void igt_stats_set_population(igt_stats_t *stats, bool full_population)
193 {
194 	if (full_population == stats->is_population)
195 		return;
196 
197 	stats->is_population = full_population;
198 	stats->mean_variance_valid = false;
199 }
200 
201 /**
202  * igt_stats_push:
203  * @stats: An #igt_stats_t instance
204  * @value: An integer value
205  *
206  * Adds a new value to the @stats dataset.
207  */
igt_stats_push(igt_stats_t * stats,uint64_t value)208 void igt_stats_push(igt_stats_t *stats, uint64_t value)
209 {
210 	if (stats->is_float) {
211 		igt_stats_push_float(stats, value);
212 		return;
213 	}
214 
215 	igt_stats_ensure_capacity(stats, 1);
216 
217 	stats->values_u64[stats->n_values++] = value;
218 
219 	stats->mean_variance_valid = false;
220 	stats->sorted_array_valid = false;
221 
222 	if (value < stats->min)
223 		stats->min = value;
224 	if (value > stats->max)
225 		stats->max = value;
226 }
227 
228 /**
229  * igt_stats_push:
230  * @stats: An #igt_stats_t instance
231  * @value: An floating point
232  *
233  * Adds a new value to the @stats dataset and converts the igt_stats from
234  * an integer collection to a floating point one.
235  */
igt_stats_push_float(igt_stats_t * stats,double value)236 void igt_stats_push_float(igt_stats_t *stats, double value)
237 {
238 	igt_stats_ensure_capacity(stats, 1);
239 
240 	if (!stats->is_float) {
241 		int n;
242 
243 		for (n = 0; n < stats->n_values; n++)
244 			stats->values_f[n] = stats->values_u64[n];
245 
246 		stats->is_float = true;
247 	}
248 
249 	stats->values_f[stats->n_values++] = value;
250 
251 	stats->mean_variance_valid = false;
252 	stats->sorted_array_valid = false;
253 
254 	if (value < stats->range[0])
255 		stats->range[0] = value;
256 	if (value > stats->range[1])
257 		stats->range[1] = value;
258 }
259 
260 /**
261  * igt_stats_push_array:
262  * @stats: An #igt_stats_t instance
263  * @values: (array length=n_values): A pointer to an array of data points
264  * @n_values: The number of data points to add
265  *
266  * Adds an array of values to the @stats dataset.
267  */
igt_stats_push_array(igt_stats_t * stats,const uint64_t * values,unsigned int n_values)268 void igt_stats_push_array(igt_stats_t *stats,
269 			  const uint64_t *values, unsigned int n_values)
270 {
271 	unsigned int i;
272 
273 	igt_stats_ensure_capacity(stats, n_values);
274 
275 	for (i = 0; i < n_values; i++)
276 		igt_stats_push(stats, values[i]);
277 }
278 
279 /**
280  * igt_stats_get_min:
281  * @stats: An #igt_stats_t instance
282  *
283  * Retrieves the minimal value in @stats
284  */
igt_stats_get_min(igt_stats_t * stats)285 uint64_t igt_stats_get_min(igt_stats_t *stats)
286 {
287 	igt_assert(!stats->is_float);
288 	return stats->min;
289 }
290 
291 /**
292  * igt_stats_get_max:
293  * @stats: An #igt_stats_t instance
294  *
295  * Retrieves the maximum value in @stats
296  */
igt_stats_get_max(igt_stats_t * stats)297 uint64_t igt_stats_get_max(igt_stats_t *stats)
298 {
299 	igt_assert(!stats->is_float);
300 	return stats->max;
301 }
302 
303 /**
304  * igt_stats_get_range:
305  * @stats: An #igt_stats_t instance
306  *
307  * Retrieves the range of the values in @stats. The range is the difference
308  * between the highest and the lowest value.
309  *
310  * The range can be a deceiving characterization of the values, because there
311  * can be extreme minimal and maximum values that are just anomalies. Prefer
312  * the interquatile range (see igt_stats_get_iqr()) or an histogram.
313  */
igt_stats_get_range(igt_stats_t * stats)314 uint64_t igt_stats_get_range(igt_stats_t *stats)
315 {
316 	return igt_stats_get_max(stats) - igt_stats_get_min(stats);
317 }
318 
cmp_u64(const void * pa,const void * pb)319 static int cmp_u64(const void *pa, const void *pb)
320 {
321 	const uint64_t *a = pa, *b = pb;
322 
323 	if (*a < *b)
324 		return -1;
325 	if (*a > *b)
326 		return 1;
327 	return 0;
328 }
329 
cmp_f(const void * pa,const void * pb)330 static int cmp_f(const void *pa, const void *pb)
331 {
332 	const double *a = pa, *b = pb;
333 
334 	if (*a < *b)
335 		return -1;
336 	if (*a > *b)
337 		return 1;
338 	return 0;
339 }
340 
igt_stats_ensure_sorted_values(igt_stats_t * stats)341 static void igt_stats_ensure_sorted_values(igt_stats_t *stats)
342 {
343 	if (stats->sorted_array_valid)
344 		return;
345 
346 	if (!stats->sorted_u64) {
347 		/*
348 		 * igt_stats_ensure_capacity() will free ->sorted when the
349 		 * capacity increases, which also correspond to an invalidation
350 		 * of the sorted array. We'll then reallocate it here on
351 		 * demand.
352 		 */
353 		stats->sorted_u64 = calloc(stats->capacity,
354 					   sizeof(*stats->values_u64));
355 		igt_assert(stats->sorted_u64);
356 	}
357 
358 	memcpy(stats->sorted_u64, stats->values_u64,
359 	       sizeof(*stats->values_u64) * stats->n_values);
360 
361 	qsort(stats->sorted_u64, stats->n_values, sizeof(*stats->values_u64),
362 	      stats->is_float ? cmp_f : cmp_u64);
363 
364 	stats->sorted_array_valid = true;
365 }
366 
367 /*
368  * We use Tukey's hinge for our quartiles determination.
369  * ends (end, lower_end) are exclusive.
370  */
371 static double
igt_stats_get_median_internal(igt_stats_t * stats,unsigned int start,unsigned int end,unsigned int * lower_end,unsigned int * upper_start)372 igt_stats_get_median_internal(igt_stats_t *stats,
373 			      unsigned int start, unsigned int end,
374 			      unsigned int *lower_end /* out */,
375 			      unsigned int *upper_start /* out */)
376 {
377 	unsigned int mid, n_values = end - start;
378 	double median;
379 
380 	igt_stats_ensure_sorted_values(stats);
381 
382 	/* odd number of data points */
383 	if (n_values % 2 == 1) {
384 		/* median is the value in the middle (actual datum) */
385 		mid = start + n_values / 2;
386 		median = sorted_value(stats, mid);
387 
388 		/* the two halves contain the median value */
389 		if (lower_end)
390 			*lower_end = mid + 1;
391 		if (upper_start)
392 			*upper_start = mid;
393 
394 	/* even number of data points */
395 	} else {
396 		/*
397 		 * The middle is in between two indexes, 'mid' points at the
398 		 * lower one. The median is then the average between those two
399 		 * values.
400 		 */
401 		mid = start + n_values / 2 - 1;
402 		median = (sorted_value(stats, mid) + sorted_value(stats, mid+1))/2.;
403 
404 		if (lower_end)
405 			*lower_end = mid + 1;
406 		if (upper_start)
407 			*upper_start = mid + 1;
408 	}
409 
410 	return median;
411 }
412 
413 /**
414  * igt_stats_get_quartiles:
415  * @stats: An #igt_stats_t instance
416  * @q1: (out): lower or 25th quartile
417  * @q2: (out): median or 50th quartile
418  * @q3: (out): upper or 75th quartile
419  *
420  * Retrieves the [quartiles](https://en.wikipedia.org/wiki/Quartile) of the
421  * @stats dataset.
422  */
igt_stats_get_quartiles(igt_stats_t * stats,double * q1,double * q2,double * q3)423 void igt_stats_get_quartiles(igt_stats_t *stats,
424 			     double *q1, double *q2, double *q3)
425 {
426 	unsigned int lower_end, upper_start;
427 	double ret;
428 
429 	if (stats->n_values < 3) {
430 		if (q1)
431 			*q1 = 0.;
432 		if (q2)
433 			*q2 = 0.;
434 		if (q3)
435 			*q3 = 0.;
436 		return;
437 	}
438 
439 	ret = igt_stats_get_median_internal(stats, 0, stats->n_values,
440 					    &lower_end, &upper_start);
441 	if (q2)
442 		*q2 = ret;
443 
444 	ret = igt_stats_get_median_internal(stats, 0, lower_end, NULL, NULL);
445 	if (q1)
446 		*q1 = ret;
447 
448 	ret = igt_stats_get_median_internal(stats, upper_start, stats->n_values,
449 					    NULL, NULL);
450 	if (q3)
451 		*q3 = ret;
452 }
453 
454 /**
455  * igt_stats_get_iqr:
456  * @stats: An #igt_stats_t instance
457  *
458  * Retrieves the
459  * [interquartile range](https://en.wikipedia.org/wiki/Interquartile_range)
460  * (IQR) of the @stats dataset.
461  */
igt_stats_get_iqr(igt_stats_t * stats)462 double igt_stats_get_iqr(igt_stats_t *stats)
463 {
464 	double q1, q3;
465 
466 	igt_stats_get_quartiles(stats, &q1, NULL, &q3);
467 	return (q3 - q1);
468 }
469 
470 /**
471  * igt_stats_get_median:
472  * @stats: An #igt_stats_t instance
473  *
474  * Retrieves the median of the @stats dataset.
475  */
igt_stats_get_median(igt_stats_t * stats)476 double igt_stats_get_median(igt_stats_t *stats)
477 {
478 	return igt_stats_get_median_internal(stats, 0, stats->n_values,
479 					     NULL, NULL);
480 }
481 
482 /*
483  * Algorithm popularised by Knuth in:
484  *
485  * The Art of Computer Programming, volume 2: Seminumerical Algorithms,
486  * 3rd edn., p. 232. Boston: Addison-Wesley
487  *
488  * Source: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
489  */
igt_stats_knuth_mean_variance(igt_stats_t * stats)490 static void igt_stats_knuth_mean_variance(igt_stats_t *stats)
491 {
492 	double mean = 0., m2 = 0.;
493 	unsigned int i;
494 
495 	if (stats->mean_variance_valid)
496 		return;
497 
498 	for (i = 0; i < stats->n_values; i++) {
499 		double delta = unsorted_value(stats, i) - mean;
500 
501 		mean += delta / (i + 1);
502 		m2 += delta * (unsorted_value(stats, i) - mean);
503 	}
504 
505 	stats->mean = mean;
506 	if (stats->n_values > 1 && !stats->is_population)
507 		stats->variance = m2 / (stats->n_values - 1);
508 	else
509 		stats->variance = m2 / stats->n_values;
510 	stats->mean_variance_valid = true;
511 }
512 
513 /**
514  * igt_stats_get_mean:
515  * @stats: An #igt_stats_t instance
516  *
517  * Retrieves the mean of the @stats dataset.
518  */
igt_stats_get_mean(igt_stats_t * stats)519 double igt_stats_get_mean(igt_stats_t *stats)
520 {
521 	igt_stats_knuth_mean_variance(stats);
522 
523 	return stats->mean;
524 }
525 
526 /**
527  * igt_stats_get_variance:
528  * @stats: An #igt_stats_t instance
529  *
530  * Retrieves the variance of the @stats dataset.
531  */
igt_stats_get_variance(igt_stats_t * stats)532 double igt_stats_get_variance(igt_stats_t *stats)
533 {
534 	igt_stats_knuth_mean_variance(stats);
535 
536 	return stats->variance;
537 }
538 
539 /**
540  * igt_stats_get_std_deviation:
541  * @stats: An #igt_stats_t instance
542  *
543  * Retrieves the standard deviation of the @stats dataset.
544  */
igt_stats_get_std_deviation(igt_stats_t * stats)545 double igt_stats_get_std_deviation(igt_stats_t *stats)
546 {
547 	igt_stats_knuth_mean_variance(stats);
548 
549 	return sqrt(stats->variance);
550 }
551 
552 /**
553  * igt_stats_get_iqm:
554  * @stats: An #igt_stats_t instance
555  *
556  * Retrieves the
557  * [interquartile mean](https://en.wikipedia.org/wiki/Interquartile_mean) (IQM)
558  * of the @stats dataset.
559  *
560  * The interquartile mean is a "statistical measure of central tendency".
561  * It is a truncated mean that discards the lowest and highest 25% of values,
562  * and calculates the mean value of the remaining central values.
563  *
564  * It's useful to hide outliers in measurements (due to cold cache etc).
565  */
igt_stats_get_iqm(igt_stats_t * stats)566 double igt_stats_get_iqm(igt_stats_t *stats)
567 {
568 	unsigned int q1, q3, i;
569 	double mean;
570 
571 	igt_stats_ensure_sorted_values(stats);
572 
573 	q1 = (stats->n_values + 3) / 4;
574 	q3 = 3 * stats->n_values / 4;
575 
576 	mean = 0;
577 	for (i = 0; i <= q3 - q1; i++)
578 		mean += (sorted_value(stats, q1 + i) - mean) / (i + 1);
579 
580 	if (stats->n_values % 4) {
581 		double rem = .5 * (stats->n_values % 4) / 4;
582 
583 		q1 = (stats->n_values) / 4;
584 		q3 = (3 * stats->n_values + 3) / 4;
585 
586 		mean += rem * (sorted_value(stats, q1) - mean) / i++;
587 		mean += rem * (sorted_value(stats, q3) - mean) / i++;
588 	}
589 
590 	return mean;
591 }
592 
593 /**
594  * igt_stats_get_trimean:
595  * @stats: An #igt_stats_t instance
596  *
597  * Retrieves the [trimean](https://en.wikipedia.org/wiki/Trimean) of the @stats
598  * dataset.
599  *
600  * The trimean is a the most efficient 3-point L-estimator, even more
601  * robust than the median at estimating the average of a sample population.
602  */
igt_stats_get_trimean(igt_stats_t * stats)603 double igt_stats_get_trimean(igt_stats_t *stats)
604 {
605 	double q1, q2, q3;
606 	igt_stats_get_quartiles(stats, &q1, &q2, &q3);
607 	return (q1 + 2*q2 + q3) / 4;
608 }
609 
610 /**
611  * igt_mean_init:
612  * @m: tracking structure
613  *
614  * Initializes or resets @m.
615  */
igt_mean_init(struct igt_mean * m)616 void igt_mean_init(struct igt_mean *m)
617 {
618 	memset(m, 0, sizeof(*m));
619 	m->max = -HUGE_VAL;
620 	m->min = HUGE_VAL;
621 }
622 
623 /**
624  * igt_mean_add:
625  * @m: tracking structure
626  * @v: value
627  *
628  * Adds a new value @v to @m.
629  */
igt_mean_add(struct igt_mean * m,double v)630 void igt_mean_add(struct igt_mean *m, double v)
631 {
632 	double delta = v - m->mean;
633 	m->mean += delta / ++m->count;
634 	m->sq += delta * (v - m->mean);
635 	if (v < m->min)
636 		m->min = v;
637 	if (v > m->max)
638 		m->max = v;
639 }
640 
641 /**
642  * igt_mean_get:
643  * @m: tracking structure
644  *
645  * Computes the current mean of the samples tracked in @m.
646  */
igt_mean_get(struct igt_mean * m)647 double igt_mean_get(struct igt_mean *m)
648 {
649 	return m->mean;
650 }
651 
652 /**
653  * igt_mean_get_variance:
654  * @m: tracking structure
655  *
656  * Computes the current variance of the samples tracked in @m.
657  */
igt_mean_get_variance(struct igt_mean * m)658 double igt_mean_get_variance(struct igt_mean *m)
659 {
660 	return m->sq / m->count;
661 }
662 
663