1 /* Copyright 2016 The Chromium OS Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE file.
4  */
5 
6 #include <math.h>  /* for abs() */
7 #include <stdio.h>  /* for printf() */
8 #include <string.h> /* for memset() */
9 #include <stdint.h> /* for uint64 definition */
10 #include <stdlib.h> /* for exit() definition */
11 #include <time.h> /* for clock_gettime */
12 
13 #include "../drc_math.h"
14 #include "../dsp_util.h"
15 
16 
/* Nanoseconds per second; used to turn a timespec difference into milliseconds. */
18 #define BILLION 1000000000LL
19 /* Number of iterations for performance testing. */
20 #define ITERATIONS 400000
21 
22 #if defined(__aarch64__)
float_to_short(float a)23 int16_t float_to_short(float a) {
24 	int32_t ret;
25 	asm volatile ("fcvtas %s[ret], %s[a]\n"
26 		      "sqxtn %h[ret], %s[ret]\n"
27 		      : [ret] "=w" (ret)
28 		      : [a] "w" (a)
29 		      :);
30 	return (int16_t)(ret);
31 }
32 #else
/*
 * Portable reference conversion of a float sample to a signed 16-bit integer.
 *
 * The value is rounded to the nearest integer with ties away from zero and
 * saturated to [-32768, 32767]. NaN yields 0, which is the NaN result this
 * test expects on non-x86 targets.
 */
int16_t float_to_short(float a)
{
	/* Converting NaN to an integer type is undefined; pin the result. */
	if (isnan(a))
		return 0;

	/* Bias by half a step so the truncating cast rounds ties away from 0. */
	a += (a >= 0) ? 0.5f : -0.5f;

	/* Saturate before the cast: out-of-range conversions are undefined. */
	if (a >= 32767.0f)
		return 32767;
	if (a <= -32768.0f)
		return -32768;

	return (int16_t)a;
}
37 #endif
38 
/*
 * Reference C implementation of de-interleaving.
 *
 * Splits interleaved signed 16-bit samples into one float buffer per channel,
 * scaling each sample by 1/32768.
 *
 * input:    frames * channels interleaved samples (frame-major order).
 * output:   array of `channels` pointers; each buffer receives `frames`
 *           floats.
 * channels: number of channels; nothing is done when it is not positive.
 * frames:   number of frames to convert.
 */
void dsp_util_deinterleave_reference(int16_t *input, float *const *output,
				     int channels, int frames)
{
	int i, j;

	/* The per-channel cursor array below is a VLA and needs a size > 0. */
	if (channels <= 0)
		return;

	/* Private write cursors so the caller's pointer array is not modified. */
	float *output_ptr[channels];

	for (i = 0; i < channels; i++)
		output_ptr[i] = output[i];

	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++)
			*(output_ptr[j]++) = *input++ / 32768.0f;
}
52 
/*
 * Reference C implementation of interleaving.
 *
 * Scales every float sample by 32768, converts it with float_to_short() and
 * writes the results frame by frame into a single interleaved 16-bit buffer.
 *
 * input:    array of `channels` pointers; each buffer supplies `frames`
 *           floats.
 * output:   receives frames * channels interleaved samples.
 * channels: number of channels; nothing is done when it is not positive.
 * frames:   number of frames to convert.
 */
void dsp_util_interleave_reference(float *const *input, int16_t *output,
				   int channels, int frames)
{
	int i, j;

	/* The per-channel cursor array below is a VLA and needs a size > 0. */
	if (channels <= 0)
		return;

	/* Private read cursors so the caller's pointer array is not modified. */
	float *input_ptr[channels];

	for (i = 0; i < channels; i++)
		input_ptr[i] = input[i];

	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++) {
			float f = *(input_ptr[j]++) * 32768.0f;
			*output++ = float_to_short(f);
		}
}
68 
69 /* Use fixed size allocation to avoid performance fluctuation of allocation. */
70 #define MAXSAMPLES 4096
71 #define MINSAMPLES 256
/* Extra bytes appended to every buffer so that writes past the expected end can be detected. */
73 #define PAD 4096
74 
TestRounding(float in,int16_t expected,int samples)75 void TestRounding(float in, int16_t expected, int samples)
76 {
77 	int i;
78 	int max_diff;
79 	int d;
80 
81 	short* in_shorts = (short*) malloc(MAXSAMPLES * 2 * 2 + PAD);
82 	float* out_floats_left_c = (float*) malloc(MAXSAMPLES * 4 + PAD);
83 	float* out_floats_right_c = (float*) malloc(MAXSAMPLES * 4 + PAD);
84 	float* out_floats_left_opt = (float*) malloc(MAXSAMPLES * 4 + PAD);
85 	float* out_floats_right_opt = (float*) malloc(MAXSAMPLES * 4 + PAD);
86 	short* out_shorts_c = (short*) malloc(MAXSAMPLES * 2 * 2 + PAD);
87 	short* out_shorts_opt = (short*) malloc(MAXSAMPLES * 2 * 2 + PAD);
88 
89 	memset(in_shorts, 0xfb, MAXSAMPLES * 2 * 2 + PAD);
90 	memset(out_floats_left_c, 0xfb, MAXSAMPLES * 4 + PAD);
91 	memset(out_floats_right_c, 0xfb, MAXSAMPLES * 4 + PAD);
92 	memset(out_floats_left_opt, 0xfb, MAXSAMPLES * 4 + PAD);
93 	memset(out_floats_right_opt, 0xfb, MAXSAMPLES * 4 + PAD);
94 	memset(out_shorts_c, 0xfb, MAXSAMPLES * 2 * 2 + PAD);
95 	memset(out_shorts_opt, 0xfb, MAXSAMPLES * 2 * 2 + PAD);
96 
97 	float *out_floats_ptr_c[2];
98 	float *out_floats_ptr_opt[2];
99 
100 	out_floats_ptr_c[0] = out_floats_left_c;
101 	out_floats_ptr_c[1] = out_floats_right_c;
102 	out_floats_ptr_opt[0] = out_floats_left_opt;
103 	out_floats_ptr_opt[1] = out_floats_right_opt;
104 
105 	for (i = 0; i < MAXSAMPLES; ++i) {
106 		out_floats_left_c[i] = in;
107 		out_floats_right_c[i] = in;
108 	}
109 
110 	/*  reference C interleave */
111 	dsp_util_interleave_reference(out_floats_ptr_c, out_shorts_c, 2,
112 				      samples);
113 
114 	/* measure optimized interleave */
115 	for (i = 0; i < ITERATIONS; ++i) {
116 		dsp_util_interleave(out_floats_ptr_c, (uint8_t *)out_shorts_opt,
117 				    2, SND_PCM_FORMAT_S16_LE, samples);
118 	}
119 
120 	max_diff = 0;
121 	for (i = 0; i < (MAXSAMPLES * 2 + PAD / 2); ++i) {
122 		d = abs(out_shorts_c[i] - out_shorts_opt[i]);
123 		if (d > max_diff) {
124 			max_diff = d;
125 		}
126 	}
127 	printf("test interleave compare %6d, %10f %13f %6d %6d %6d %s\n",
128 		max_diff, in, in * 32768.0f, out_shorts_c[0], out_shorts_opt[0],
129 		expected,
130 		max_diff == 0 ? "PASS" : (out_shorts_opt[0] == expected ?
131 		"EXPECTED DIFFERENCE" : "UNEXPECTED DIFFERENCE"));
132 
133 	/* measure reference C deinterleave */
134 	dsp_util_deinterleave_reference(in_shorts, out_floats_ptr_c, 2,
135 					samples);
136 
137 	/* measure optimized deinterleave */
138 	dsp_util_deinterleave((uint8_t *)in_shorts, out_floats_ptr_opt, 2,
139 			      SND_PCM_FORMAT_S16_LE, samples);
140 
141 	d = memcmp(out_floats_ptr_c[0], out_floats_ptr_opt[0], samples * 4);
142 	if (d) printf("left compare %d, %f %f\n", d, out_floats_ptr_c[0][0],
143 		      out_floats_ptr_opt[0][0]);
144 	d = memcmp(out_floats_ptr_c[1], out_floats_ptr_opt[1], samples * 4);
145 	if (d) printf("right compare %d, %f %f\n", d, out_floats_ptr_c[1][0],
146 		      out_floats_ptr_opt[1][0]);
147 
148 	free(in_shorts);
149 	free(out_floats_left_c);
150 	free(out_floats_right_c);
151 	free(out_floats_left_opt);
152 	free(out_floats_right_opt);
153 	free(out_shorts_c);
154 	free(out_shorts_opt);
155 }
156 
main(int argc,char ** argv)157 int main(int argc, char **argv)
158 {
159 	float e = 0.000000001f;
160 	int samples = 16;
161 
162 	dsp_enable_flush_denormal_to_zero();
163 
164 	// Print headings for TestRounding output.
165 	printf("test interleave compare maxdif,     float,   float * 32k      "
166 	       "C   SIMD expect pass\n");
167 
168 	// test clamping
169 	TestRounding(1.0f, 32767, samples);
170 	TestRounding(-1.0f, -32768, samples);
171 	TestRounding(1.1f, 32767, samples);
172 	TestRounding(-1.1f, -32768, samples);
173 	TestRounding(2000000000.f / 32768.f, 32767, samples);
174 	TestRounding(-2000000000.f / 32768.f, -32768, samples);
175 
176 	/* Infinity produces zero on arm64. */
177 #if defined(__aarch64__)
178 #define EXPECTED_INF_RESULT 0
179 #define EXPECTED_NEGINF_RESULT 0
180 #elif defined(__i386__) || defined(__x86_64__)
181 #define EXPECTED_INF_RESULT -32768
182 #define EXPECTED_NEGINF_RESULT 0
183 #else
184 #define EXPECTED_INF_RESULT 32767
185 #define EXPECTED_NEGINF_RESULT -32768
186 #endif
187 
188 	TestRounding(5000000000.f / 32768.f, EXPECTED_INF_RESULT, samples);
189 	TestRounding(-5000000000.f / 32768.f, EXPECTED_NEGINF_RESULT, samples);
190 
191 	// test infinity
192 	union ieee754_float inf;
193 	inf.ieee.negative = 0;
194 	inf.ieee.exponent = 0xfe;
195 	inf.ieee.mantissa = 0x7fffff;
196 	TestRounding(inf.f, EXPECTED_INF_RESULT, samples);  // expect fail
197 	inf.ieee.negative = 1;
198 	inf.ieee.exponent = 0xfe;
199 	inf.ieee.mantissa = 0x7fffff;
200 	TestRounding(inf.f, EXPECTED_NEGINF_RESULT, samples);  // expect fail
201 
202 	// test rounding
203 	TestRounding(0.25f, 8192, samples);
204 	TestRounding(-0.25f, -8192, samples);
205 	TestRounding(0.50f, 16384, samples);
206 	TestRounding(-0.50f, -16384, samples);
207 	TestRounding(1.0f / 32768.0f, 1, samples);
208 	TestRounding(-1.0f / 32768.0f, -1, samples);
209 	TestRounding(1.0f / 32768.0f + e, 1, samples);
210 	TestRounding(-1.0f / 32768.0f - e, -1, samples);
211 	TestRounding(1.0f / 32768.0f - e, 1, samples);
212 	TestRounding(-1.0f / 32768.0f + e, -1, samples);
213 
214 	/* Rounding on 'tie' is different for Intel. */
215 #if defined(__i386__) || defined(__x86_64__)
216 	TestRounding(0.5f / 32768.0f, 0, samples);  /* Expect round to even */
217 	TestRounding(-0.5f / 32768.0f, 0, samples);
218 #else
219 	TestRounding(0.5f / 32768.0f, 1, samples);  /* Expect round away */
220 	TestRounding(-0.5f / 32768.0f, -1, samples);
221 #endif
222 
223 	TestRounding(0.5f / 32768.0f + e, 1, samples);
224 	TestRounding(-0.5f / 32768.0f - e, 1, samples);
225 	TestRounding(0.5f / 32768.0f - e, 0, samples);
226 	TestRounding(-0.5f / 32768.0f + e, 0, samples);
227 
228 	TestRounding(1.5f / 32768.0f, 2, samples);
229 	TestRounding(-1.5f / 32768.0f, -2, samples);
230 	TestRounding(1.5f / 32768.0f + e, 2, samples);
231 	TestRounding(-1.5f / 32768.0f - e, -2, samples);
232 	TestRounding(1.5f / 32768.0f - e, 1, samples);
233 	TestRounding(-1.5f / 32768.0f + e, -1, samples);
234 
235 	/* Test denormals */
236 	union ieee754_float denorm;
237 	denorm.ieee.negative = 0;
238 	denorm.ieee.exponent = 0;
239 	denorm.ieee.mantissa = 1;
240 	TestRounding(denorm.f, 0, samples);
241 	denorm.ieee.negative = 1;
242 	denorm.ieee.exponent = 0;
243 	denorm.ieee.mantissa = 1;
244 	TestRounding(denorm.f, 0, samples);
245 
246 	/* Test NaNs. Caveat Results vary by implementation. */
247 #if defined(__i386__) || defined(__x86_64__)
248 #define EXPECTED_NAN_RESULT -32768
249 #else
250 #define EXPECTED_NAN_RESULT 0
251 #endif
252 	union ieee754_float nan;  /* Quiet NaN */
253 	nan.ieee.negative = 0;
254 	nan.ieee.exponent = 0xff;
255 	nan.ieee.mantissa = 0x400001;
256 	TestRounding(nan.f, EXPECTED_NAN_RESULT, samples);
257 	nan.ieee.negative = 0;
258 	nan.ieee.exponent = 0xff;
259 	nan.ieee.mantissa = 0x000001;  /* Signalling NaN */
260 	TestRounding(nan.f, EXPECTED_NAN_RESULT, samples);
261 
262 	/* Test Performance */
263 	uint64_t diff;
264 	struct timespec start, end;
265 	int i;
266 	int d;
267 
268 	short* in_shorts = (short*) malloc(MAXSAMPLES * 2 * 2 + PAD);
269 	float* out_floats_left_c = (float*) malloc(MAXSAMPLES * 4 + PAD);
270 	float* out_floats_right_c = (float*) malloc(MAXSAMPLES * 4 + PAD);
271 	float* out_floats_left_opt = (float*) malloc(MAXSAMPLES * 4 + PAD);
272 	float* out_floats_right_opt = (float*) malloc(MAXSAMPLES * 4 + PAD);
273 	short* out_shorts_c = (short*) malloc(MAXSAMPLES * 2 * 2 + PAD);
274 	short* out_shorts_opt = (short*) malloc(MAXSAMPLES * 2 * 2 + PAD);
275 
276 	memset(in_shorts, 0x11, MAXSAMPLES * 2 * 2 + PAD);
277 	memset(out_floats_left_c, 0x22, MAXSAMPLES * 4 + PAD);
278 	memset(out_floats_right_c, 0x33, MAXSAMPLES * 4 + PAD);
279 	memset(out_floats_left_opt, 0x44, MAXSAMPLES * 4 + PAD);
280 	memset(out_floats_right_opt, 0x55, MAXSAMPLES * 4 + PAD);
281 	memset(out_shorts_c, 0x66, MAXSAMPLES * 2 * 2 + PAD);
282 	memset(out_shorts_opt, 0x66, MAXSAMPLES * 2 * 2 + PAD);
283 
284 	float *out_floats_ptr_c[2];
285 	float *out_floats_ptr_opt[2];
286 
287 	out_floats_ptr_c[0] = out_floats_left_c;
288 	out_floats_ptr_c[1] = out_floats_right_c;
289 	out_floats_ptr_opt[0] = out_floats_left_opt;
290 	out_floats_ptr_opt[1] = out_floats_right_opt;
291 
292 	/* Benchmark dsp_util_interleave */
293 	for (samples = MAXSAMPLES; samples >= MINSAMPLES; samples /= 2) {
294 
295 		/* measure original C interleave */
296 		clock_gettime(CLOCK_MONOTONIC, &start); /* mark start time */
297 		for (i = 0; i < ITERATIONS; ++i) {
298 			dsp_util_interleave_reference(out_floats_ptr_c,
299 						      out_shorts_c,
300 						      2, samples);
301 		}
302 		clock_gettime(CLOCK_MONOTONIC, &end); /* mark the end time */
303 		diff = (BILLION * (end.tv_sec - start.tv_sec) +
304 			end.tv_nsec - start.tv_nsec) / 1000000;
305 		printf("interleave   ORIG size = %6d, elapsed time = %llu ms\n",
306 		       samples, (long long unsigned int) diff);
307 
308 		/* measure optimized interleave */
309 		clock_gettime(CLOCK_MONOTONIC, &start); /* mark start time */
310 		for (i = 0; i < ITERATIONS; ++i) {
311 			dsp_util_interleave(out_floats_ptr_c,
312 					    (uint8_t *)out_shorts_opt, 2,
313 					    SND_PCM_FORMAT_S16_LE, samples);
314 		}
315 		clock_gettime(CLOCK_MONOTONIC, &end); /* mark the end time */
316 		diff = (BILLION * (end.tv_sec - start.tv_sec) +
317 			end.tv_nsec - start.tv_nsec) / 1000000;
318 		printf("interleave   SIMD size = %6d, elapsed time = %llu ms\n",
319 		       samples, (long long unsigned int) diff);
320 
321 		/* Test C and SIMD output match */
322 		d = memcmp(out_shorts_c, out_shorts_opt,
323 			   MAXSAMPLES * 2 * 2 + PAD);
324 		if (d) printf("interleave compare %d, %d %d, %d %d\n", d,
325 			      out_shorts_c[0], out_shorts_c[1],
326 			      out_shorts_opt[0], out_shorts_opt[1]);
327 	}
328 
329 	/* Benchmark dsp_util_deinterleave */
330 	for (samples = MAXSAMPLES; samples >= MINSAMPLES; samples /= 2) {
331 
332 		/* Measure original C deinterleave */
333 		clock_gettime(CLOCK_MONOTONIC, &start); /* mark start time */
334 		for (i = 0; i < ITERATIONS; ++i) {
335 			dsp_util_deinterleave_reference(in_shorts,
336 							out_floats_ptr_c,
337 							2, samples);
338 		}
339 		clock_gettime(CLOCK_MONOTONIC, &end); /* mark the end time */
340 		diff = (BILLION * (end.tv_sec - start.tv_sec) +
341 			end.tv_nsec - start.tv_nsec) / 1000000;
342 			printf("deinterleave ORIG size = %6d, "
343 			       "elapsed time = %llu ms\n",
344 			       samples, (long long unsigned int) diff);
345 
346 		/* Measure optimized deinterleave */
347 		clock_gettime(CLOCK_MONOTONIC, &start); /* mark start time */
348 		for (i = 0; i < ITERATIONS; ++i) {
349 			dsp_util_deinterleave((uint8_t *)in_shorts,
350 					      out_floats_ptr_opt, 2,
351 					      SND_PCM_FORMAT_S16_LE, samples);
352 		}
353 		clock_gettime(CLOCK_MONOTONIC, &end); /* mark the end time */
354 		diff = (BILLION * (end.tv_sec - start.tv_sec) +
355 			end.tv_nsec - start.tv_nsec) / 1000000;
356 		printf("deinterleave SIMD size = %6d, elapsed time = %llu ms\n",
357 			samples, (long long unsigned int) diff);
358 
359 		/* Test C and SIMD output match */
360 		d = memcmp(out_floats_ptr_c[0], out_floats_ptr_opt[0],
361 			   samples * 4);
362 		if (d) printf("left compare %d, %f %f\n", d,
363 			      out_floats_ptr_c[0][0], out_floats_ptr_opt[0][0]);
364 		d = memcmp(out_floats_ptr_c[1], out_floats_ptr_opt[1],
365 			   samples * 4);
366 		if (d) printf("right compare %d, %f %f\n", d,
367 			      out_floats_ptr_c[1][0], out_floats_ptr_opt[1][0]);
368 	}
369 
370 	free(in_shorts);
371 	free(out_floats_left_c);
372 	free(out_floats_right_c);
373 	free(out_floats_left_opt);
374 	free(out_floats_right_opt);
375 	free(out_shorts_c);
376 	free(out_shorts_opt);
377 
378 	return 0;
379 }
380