1 /* Copyright 2013 The Chromium OS Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE file.
4  */
5 
6 #include <limits.h>
7 #include <syslog.h>
8 
9 #include "dsp_util.h"
10 
/* Type-generic maximum.  Built on a GNU statement expression so that each
 * argument is evaluated exactly once; skipped when max() already exists.
 */
#ifndef max
#define max(a, b)					\
	({						\
		__typeof__(a) _max_lhs = (a);		\
		__typeof__(b) _max_rhs = (b);		\
		_max_lhs > _max_rhs ? _max_lhs : _max_rhs;	\
	})
#endif

/* Type-generic minimum, same single-evaluation guarantee as max(). */
#ifndef min
#define min(a, b)					\
	({						\
		__typeof__(a) _min_lhs = (a);		\
		__typeof__(b) _min_rhs = (b);		\
		_min_lhs < _min_rhs ? _min_lhs : _min_rhs;	\
	})
#endif
22 
23 #undef deinterleave_stereo
24 #undef interleave_stereo
25 
26 /* Converts shorts in range of -32768 to 32767 to floats in range of
27  * -1.0f to 1.0f.
28  * scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
29  * shorts to int with sign extension.
30  */
31 #ifdef __aarch64__
deinterleave_stereo(int16_t * input,float * output1,float * output2,int frames)32 static void deinterleave_stereo(int16_t *input, float *output1,
33 				float *output2, int frames)
34 {
35 	int chunk = frames >> 3;
36 	frames &= 7;
37 	/* Process 8 frames (16 samples) each loop. */
38 	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
39 	if (chunk) {
40 		__asm__ __volatile__ (
41 			"1:                                         \n"
42 			"ld2  {v2.8h, v3.8h}, [%[input]], #32       \n"
43 			"subs %w[chunk], %w[chunk], #1              \n"
44 			"sxtl   v0.4s, v2.4h                        \n"
45 			"sxtl2  v1.4s, v2.8h                        \n"
46 			"sxtl   v2.4s, v3.4h                        \n"
47 			"sxtl2  v3.4s, v3.8h                        \n"
48 			"scvtf  v0.4s, v0.4s, #15                   \n"
49 			"scvtf  v1.4s, v1.4s, #15                   \n"
50 			"scvtf  v2.4s, v2.4s, #15                   \n"
51 			"scvtf  v3.4s, v3.4s, #15                   \n"
52 			"st1    {v0.4s, v1.4s}, [%[output1]], #32   \n"
53 			"st1    {v2.4s, v3.4s}, [%[output2]], #32   \n"
54 			"b.ne   1b                                  \n"
55 			: /* output */
56 			  [chunk]"+r"(chunk),
57 			  [input]"+r"(input),
58 			  [output1]"+r"(output1),
59 			  [output2]"+r"(output2)
60 			: /* input */
61 			: /* clobber */
62 			  "v0", "v1", "v2", "v3", "memory", "cc"
63 			);
64 	}
65 
66 	/* The remaining samples. */
67 	while (frames--) {
68 		*output1++ = *input++ / 32768.0f;
69 		*output2++ = *input++ / 32768.0f;
70 	}
71 }
72 #define deinterleave_stereo deinterleave_stereo
73 
74 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
75  * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
76  * from zero.
77  * Rounding is achieved by using fcvtas instruction. (a = away)
78  * The float scaled to a range of -32768 to 32767 by adding 15 to the exponent.
79  * Add to exponent is equivalent to multiply for exponent range of 0 to 239,
80  * which is 2.59 * 10^33.  A signed saturating add (sqadd) limits exponents
81  * from 240 to 255 to clamp to 255.
82  * For very large values, beyond +/- 2 billion, fcvtas will clamp the result
83  * to the min or max value that fits an int.
84  * For other values, sqxtn clamps the output to -32768 to 32767 range.
85  */
interleave_stereo(float * input1,float * input2,int16_t * output,int frames)86 static void interleave_stereo(float *input1, float *input2,
87 			      int16_t *output, int frames)
88 {
89 	/* Process 4 frames (8 samples) each loop. */
90 	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
91 	int chunk = frames >> 2;
92 	frames &= 3;
93 
94 	if (chunk) {
95 		__asm__ __volatile__ (
96 			"dup    v2.4s, %w[scale]                    \n"
97 			"1:                                         \n"
98 			"ld1    {v0.4s}, [%[input1]], #16           \n"
99 			"ld1    {v1.4s}, [%[input2]], #16           \n"
100 			"subs   %w[chunk], %w[chunk], #1            \n"
101 			"sqadd  v0.4s, v0.4s, v2.4s                 \n"
102 			"sqadd  v1.4s, v1.4s, v2.4s                 \n"
103 			"fcvtas v0.4s, v0.4s                        \n"
104 			"fcvtas v1.4s, v1.4s                        \n"
105 			"sqxtn  v0.4h, v0.4s                        \n"
106 			"sqxtn  v1.4h, v1.4s                        \n"
107 			"st2    {v0.4h, v1.4h}, [%[output]], #16    \n"
108 			"b.ne   1b                                  \n"
109 			: /* output */
110 			  [chunk]"+r"(chunk),
111 			  [input1]"+r"(input1),
112 			  [input2]"+r"(input2),
113 			  [output]"+r"(output)
114 			: /* input */
115 			  [scale]"r"(15 << 23)
116 			: /* clobber */
117 			  "v0", "v1", "v2", "memory", "cc"
118 			);
119 	}
120 
121 	/* The remaining samples */
122 	while (frames--) {
123 		float f;
124 		f = *input1++ * 32768.0f;
125 		f += (f >= 0) ? 0.5f : -0.5f;
126 		*output++ = max(-32768, min(32767, (int)(f)));
127 		f = *input2++ * 32768.0f;
128 		f += (f >= 0) ? 0.5f : -0.5f;
129 		*output++ = max(-32768, min(32767, (int)(f)));
130 	}
131 }
132 #define interleave_stereo interleave_stereo
133 #endif
134 
135 #ifdef __ARM_NEON__
136 #include <arm_neon.h>
137 
deinterleave_stereo(int16_t * input,float * output1,float * output2,int frames)138 static void deinterleave_stereo(int16_t *input, float *output1,
139 				float *output2, int frames)
140 {
141 	/* Process 8 frames (16 samples) each loop. */
142 	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
143 	int chunk = frames >> 3;
144 	frames &= 7;
145 	if (chunk) {
146 		__asm__ __volatile__ (
147 			"1:					    \n"
148 			"vld2.16 {d0-d3}, [%[input]]!		    \n"
149 			"subs %[chunk], #1			    \n"
150 			"vmovl.s16 q3, d3			    \n"
151 			"vmovl.s16 q2, d2			    \n"
152 			"vmovl.s16 q1, d1			    \n"
153 			"vmovl.s16 q0, d0			    \n"
154 			"vcvt.f32.s32 q3, q3, #15		    \n"
155 			"vcvt.f32.s32 q2, q2, #15		    \n"
156 			"vcvt.f32.s32 q1, q1, #15		    \n"
157 			"vcvt.f32.s32 q0, q0, #15		    \n"
158 			"vst1.32 {d4-d7}, [%[output2]]!		    \n"
159 			"vst1.32 {d0-d3}, [%[output1]]!		    \n"
160 			"bne 1b					    \n"
161 			: /* output */
162 			  [chunk]"+r"(chunk),
163 			  [input]"+r"(input),
164 			  [output1]"+r"(output1),
165 			  [output2]"+r"(output2)
166 			: /* input */
167 			: /* clobber */
168 			  "q0", "q1", "q2", "q3", "memory", "cc"
169 			);
170 	}
171 
172 	/* The remaining samples. */
173 	while (frames--) {
174 		*output1++ = *input++ / 32768.0f;
175 		*output2++ = *input++ / 32768.0f;
176 	}
177 }
178 #define deinterleave_stereo deinterleave_stereo
179 
/* Converts floats in range of -1.0f to 1.0f to shorts in range of
 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
 * from zero, and interleaves the two channels (ARM NEON version).
 *
 * Vector loop:
 * Rounding is achieved by adding 0.5 or -0.5 adjusted for fixed point
 * precision, and then converting float to fixed point using the vcvt
 * instruction, which truncates toward zero.
 * For very large values, beyond +/- 2 billion, vcvt will clamp the result
 * to the min or max value that fits an int.
 * For other values, vqmovn clamps the output to -32768 to 32767 range.
 *
 * Scalar tail (frames % 4 leftover frames):
 * Adds +/- 0.5 and truncates.  The value is clamped while it is still a
 * float, because converting an out-of-range float to an integer is
 * undefined behavior in C; NaN is written as 0.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
	int chunk = frames >> 2;
	const int tail = frames & 3;
	int i;

	if (chunk) {
		__asm__ __volatile__ (
			"veor q0, q0, q0			    \n"
			"1:					    \n"
			"vld1.32 {d2-d3}, [%[input1]]!		    \n"
			"vld1.32 {d4-d5}, [%[input2]]!		    \n"
			"subs %[chunk], #1			    \n"
			/* We try to round to the nearest number by adding 0.5
			 * to positive input, and adding -0.5 to the negative
			 * input, then truncate.
			 */
			"vcgt.f32 q3, q1, q0			    \n"
			"vcgt.f32 q4, q2, q0			    \n"
			"vbsl q3, %q[pos], %q[neg]		    \n"
			"vbsl q4, %q[pos], %q[neg]		    \n"
			"vadd.f32 q1, q1, q3			    \n"
			"vadd.f32 q2, q2, q4			    \n"
			"vcvt.s32.f32 q1, q1, #15		    \n"
			"vcvt.s32.f32 q2, q2, #15		    \n"
			"vqmovn.s32 d2, q1			    \n"
			"vqmovn.s32 d3, q2			    \n"
			"vst2.16 {d2-d3}, [%[output]]!		    \n"
			"bne 1b					    \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [pos]"w"(pos),
			  [neg]"w"(neg)
			: /* clobber */
			  "q0", "q1", "q2", "q3", "q4", "memory", "cc"
			);
	}

	/* The remaining samples; the asm left the pointers just past the
	 * frames it converted.
	 */
	for (i = 0; i < tail; i++) {
		const float pair[2] = { input1[i], input2[i] };
		int ch;

		for (ch = 0; ch < 2; ch++) {
			float f = pair[ch] * 32768.0f;
			int16_t sample = 0;	/* NaN is written as 0. */

			f += (f >= 0) ? 0.5f : -0.5f;
			if (f >= 32767.0f)
				sample = 32767;
			else if (f <= -32768.0f)
				sample = -32768;
			else if (f == f)
				sample = (int16_t)f;
			*output++ = sample;
		}
	}
}
247 #define interleave_stereo interleave_stereo
248 #endif
249 
250 #ifdef __SSE3__
251 #include <emmintrin.h>
252 
/* SSE3 version: converts interleaved shorts in range of -32768 to 32767 to
 * two channels of floats in range of -1.0f to 1.0f.
 *
 * Each 32-bit lane of the input holds one frame: the first sample in the low
 * word and the second sample in the high word.  pslld and psrad isolate the
 * two words, but leave them in different ranges:
 *   - pslld moves the low word to the high bits
 *     (0x80000000 .. 0x7fff0000);
 *   - psrad moves the high word to the low bits with sign extension
 *     (-32768 .. 32767).
 * cvtdq2ps converts those ints to floats as is, and mulps then normalizes
 * both to -1.0f .. 1.0f using a different factor for each range
 * (2^-31 for the shifted-up word, 2^-15 for the shifted-down word).
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	/* The vector loop handles 8 frames (16 samples) per iteration; the
	 * 0..7 frames that do not fill a group are converted one by one.
	 */
	const int tail = frames & 7;
	int chunk = frames >> 3;
	int i;

	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			"lddqu (%[input]), %%xmm0                   \n"
			"lddqu 16(%[input]), %%xmm1                 \n"
			"add $32, %[input]                          \n"
			"movdqa %%xmm0, %%xmm2                      \n"
			"movdqa %%xmm1, %%xmm3                      \n"
			"pslld $16, %%xmm0                          \n"
			"pslld $16, %%xmm1                          \n"
			"psrad $16, %%xmm2                          \n"
			"psrad $16, %%xmm3                          \n"
			"cvtdq2ps %%xmm0, %%xmm0                    \n"
			"cvtdq2ps %%xmm1, %%xmm1                    \n"
			"cvtdq2ps %%xmm2, %%xmm2                    \n"
			"cvtdq2ps %%xmm3, %%xmm3                    \n"
			"mulps %[scale_2_n31], %%xmm0               \n"
			"mulps %[scale_2_n31], %%xmm1               \n"
			"mulps %[scale_2_n15], %%xmm2               \n"
			"mulps %[scale_2_n15], %%xmm3               \n"
			"movdqu %%xmm0, (%[output1])                \n"
			"movdqu %%xmm1, 16(%[output1])              \n"
			"movdqu %%xmm2, (%[output2])                \n"
			"movdqu %%xmm3, 16(%[output2])              \n"
			"add $32, %[output1]                        \n"
			"add $32, %[output2]                        \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			  [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
			  [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
			);
	}

	/* The asm advanced all three pointers, so they now point at the
	 * first frame it did not process.
	 */
	for (i = 0; i < tail; i++) {
		output1[i] = input[2 * i] / 32768.0f;
		output2[i] = input[2 * i + 1] / 32768.0f;
	}
}
317 #define deinterleave_stereo deinterleave_stereo
318 
/* Converts floats in range of -1.0f to 1.0f to shorts in range of
 * -32768 to 32767 and interleaves the two channels (SSE3 version).
 *
 * Vector loop:
 * unpcklps/unpckhps interleave the two channels, paddsw adds 15 << 23 to the
 * raw float bits (15 added to the exponent field) to scale the values up to
 * the 16-bit range, cvtps2dq converts to int with rounding to nearest, with
 * ties (0.5) rounding to even, and packssdw narrows to 16 bits with
 * saturation.
 * For very large values, beyond +/- 2 billion, cvtps2dq will produce
 * 0x80000000 and packssdw will clamp that to -32768.
 *
 * Scalar tail (frames % 4 leftover frames):
 * Adds +/- 0.5 and truncates, so ties round away from zero here.  The value
 * is clamped while it is still a float, because converting an out-of-range
 * float to an integer is undefined behavior in C; very large positive values
 * therefore become 32767 in the tail, and NaN is written as 0.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	int chunk = frames >> 2;
	const int tail = frames & 3;
	int i;

	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			"lddqu (%[input1]), %%xmm0                  \n"
			"lddqu (%[input2]), %%xmm2                  \n"
			"add $16, %[input1]                         \n"
			"add $16, %[input2]                         \n"
			"movaps %%xmm0, %%xmm1                      \n"
			"unpcklps %%xmm2, %%xmm0                    \n"
			"unpckhps %%xmm2, %%xmm1                    \n"
			"paddsw %[scale_2_15], %%xmm0               \n"
			"paddsw %[scale_2_15], %%xmm1               \n"
			"cvtps2dq %%xmm0, %%xmm0                    \n"
			"cvtps2dq %%xmm1, %%xmm1                    \n"
			"packssdw %%xmm1, %%xmm0                    \n"
			"movdqu %%xmm0, (%[output])                 \n"
			"add $16, %[output]                         \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [scale_2_15]"x"(_mm_set1_epi32(15 << 23))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "memory", "cc"
			);
	}

	/* The remaining samples; the asm left the pointers just past the
	 * frames it converted.
	 */
	for (i = 0; i < tail; i++) {
		const float pair[2] = { input1[i], input2[i] };
		int ch;

		for (ch = 0; ch < 2; ch++) {
			float f = pair[ch] * 32768.0f;
			int16_t sample = 0;	/* NaN is written as 0. */

			f += (f >= 0) ? 0.5f : -0.5f;
			if (f >= 32767.0f)
				sample = 32767;
			else if (f <= -32768.0f)
				sample = -32768;
			else if (f == f)
				sample = (int16_t)f;
			*output++ = sample;
		}
	}
}
376 #define interleave_stereo interleave_stereo
377 #endif
378 
/* Converts interleaved signed 16-bit samples to per-channel floats.
 *
 * input holds frames * channels samples, frame after frame; output[c]
 * receives the frames samples of channel c, each divided by 32768.  When a
 * deinterleave_stereo() fast path was compiled in, two-channel input is
 * handed to it instead.
 */
static void dsp_util_deinterleave_s16le(int16_t *input, float *const *output,
					int channels, int frames)
{
	int frame, ch;

#ifdef deinterleave_stereo
	if (channels == 2) {
		deinterleave_stereo(input, output[0], output[1], frames);
		return;
	}
#endif

	/* Walk the input linearly and scatter into the channel buffers. */
	for (frame = 0; frame < frames; frame++) {
		for (ch = 0; ch < channels; ch++) {
			output[ch][frame] = *input / 32768.0f;
			input++;
		}
	}
}
399 
400 
/* Converts interleaved 24-bit samples stored in 32-bit words to per-channel
 * floats.
 *
 * The low 24 bits of every word carry the sample; the top byte is discarded.
 * The sample is moved into the top 24 bits of a 32-bit value and divided by
 * 2^31, so the result lies in -1.0f .. 1.0f.
 *
 * input holds frames * channels words, frame after frame; output[c] receives
 * the frames samples of channel c.
 */
static void dsp_util_deinterleave_s24le(int32_t *input, float *const *output,
					int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++, input++) {
			/* Shift as unsigned: left-shifting a negative or
			 * overflowing signed value is undefined behavior.
			 */
			int32_t sample = (int32_t)((uint32_t)*input << 8);

			output[j][i] = sample / 2147483648.0f;
		}
}
415 
/* Converts interleaved packed 24-bit little-endian samples (3 bytes each)
 * to per-channel floats.
 *
 * Each sample is placed in the top 24 bits of a 32-bit value (low byte zero)
 * and divided by 2^31, so the result lies in -1.0f .. 1.0f.  The value is
 * assembled byte by byte, so the result does not depend on the byte order of
 * the host.
 *
 * input holds frames * channels * 3 bytes, frame after frame; output[c]
 * receives the frames samples of channel c.
 */
static void dsp_util_deinterleave_s243le(uint8_t *input, float *const *output,
					 int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++, input += 3) {
			/* input[0] is the least significant byte. */
			uint32_t bits = ((uint32_t)input[0] << 8) |
					((uint32_t)input[1] << 16) |
					((uint32_t)input[2] << 24);

			output[j][i] = (int32_t)bits / 2147483648.0f;
		}
}
433 
/* Converts interleaved signed 32-bit samples to per-channel floats.
 *
 * input holds frames * channels samples, frame after frame; output[c]
 * receives the frames samples of channel c, each divided by 2^31.
 */
static void dsp_util_deinterleave_s32le(int32_t *input, float *const *output,
					int channels, int frames)
{
	int frame, ch;

	/* Walk the input linearly and scatter into the channel buffers. */
	for (frame = 0; frame < frames; frame++) {
		for (ch = 0; ch < channels; ch++) {
			output[ch][frame] = *input / 2147483648.0f;
			input++;
		}
	}
}
447 
/* Converts interleaved integer samples to per-channel floats, picking the
 * converter that matches the sample format.
 *
 * Handles S16_LE, S24_LE, S24_3LE and S32_LE.  input is reinterpreted as the
 * sample type of the chosen format.
 *
 * Returns 0 on success.  For any other format an error is logged and
 * -EINVAL is returned without touching output.
 */
int dsp_util_deinterleave(uint8_t *input, float *const *output, int channels,
			  snd_pcm_format_t format, int frames)
{
	switch (format) {
	case SND_PCM_FORMAT_S16_LE:
		dsp_util_deinterleave_s16le((int16_t *)input, output,
					    channels, frames);
		return 0;
	case SND_PCM_FORMAT_S24_LE:
		dsp_util_deinterleave_s24le((int32_t *)input, output,
					    channels, frames);
		return 0;
	case SND_PCM_FORMAT_S24_3LE:
		dsp_util_deinterleave_s243le(input, output, channels, frames);
		return 0;
	case SND_PCM_FORMAT_S32_LE:
		dsp_util_deinterleave_s32le((int32_t *)input, output,
					    channels, frames);
		return 0;
	default:
		break;
	}

	syslog(LOG_ERR, "Invalid format to deinterleave");
	return -EINVAL;
}
474 
/* Converts per-channel floats to interleaved signed 16-bit samples.
 *
 * Every sample is multiplied by 32768 and rounded to nearest, with ties
 * rounding away from zero (+/- 0.5 is added before truncation).  Values
 * outside -32768 .. 32767 are clamped to that range.  The clamp is applied
 * while the value is still a float, because converting an out-of-range
 * float to an integer is undefined behavior in C; NaN is written as 0.
 *
 * input[c] holds the frames samples of channel c; output receives
 * frames * channels samples, frame after frame.  When an interleave_stereo()
 * fast path was compiled in, two-channel input is handed to it instead.
 */
static void dsp_util_interleave_s16le(float *const *input, int16_t *output,
				      int channels, int frames)
{
	int i, j;

#ifdef interleave_stereo
	if (channels == 2) {
		interleave_stereo(input[0], input[1], output, frames);
		return;
	}
#endif

	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++) {
			float f = input[j][i] * 32768.0f;
			int16_t sample = 0;	/* NaN is written as 0. */

			f += (f >= 0) ? 0.5f : -0.5f;
			if (f >= 32767.0f)
				sample = 32767;
			else if (f <= -32768.0f)
				sample = -32768;
			else if (f == f)
				sample = (int16_t)f;
			*output++ = sample;
		}
}
498 
/* Converts per-channel floats to interleaved 24-bit samples stored in
 * 32-bit words.
 *
 * Every sample is multiplied by 2^31, has +/- 0.5 added, is clamped to the
 * int range and converted, and is then shifted right by 8 so that the
 * 24-bit value ends up sign-extended in the 32-bit word.
 *
 * (float)INT_MAX rounds up to 2^31, which does not fit an int, so the upper
 * bound is tested explicitly instead of with min(): converting 2^31 to an
 * int is undefined behavior and turned +1.0f into the most negative sample
 * on some targets.  NaN is written as 0.
 *
 * input[c] holds the frames samples of channel c; output receives
 * frames * channels words, frame after frame.
 */
static void dsp_util_interleave_s24le(float *const *input, int32_t *output,
				      int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++, output++) {
			float f = input[j][i] * 2147483648.0f;
			int32_t sample = 0;	/* NaN is written as 0. */

			f += (f >= 0) ? 0.5f : -0.5f;
			if (f >= 2147483648.0f)
				sample = INT_MAX;
			else if (f <= -2147483648.0f)
				sample = INT_MIN;
			else if (f == f)
				sample = (int32_t)f;
			*output = sample >> 8;
		}
}
516 
/* Converts per-channel floats to interleaved packed 24-bit little-endian
 * samples (3 bytes each).
 *
 * Every sample is multiplied by 2^31, has +/- 0.5 added, is clamped to the
 * int range and converted; the top 24 bits of that value are then stored,
 * least significant byte first.  The bytes are written one at a time, so the
 * output does not depend on the byte order of the host.
 *
 * (float)INT_MAX rounds up to 2^31, which does not fit an int, so the upper
 * bound is tested explicitly instead of with min(): converting 2^31 to an
 * int is undefined behavior and turned +1.0f into the most negative sample
 * on some targets.  NaN is written as 0.
 *
 * input[c] holds the frames samples of channel c; output receives
 * frames * channels * 3 bytes, frame after frame.
 */
static void dsp_util_interleave_s243le(float *const *input, uint8_t *output,
				       int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++, output += 3) {
			float f = input[j][i] * 2147483648.0f;
			int32_t sample = 0;	/* NaN is written as 0. */
			uint32_t bits;

			f += (f >= 0) ? 0.5f : -0.5f;
			if (f >= 2147483648.0f)
				sample = INT_MAX;
			else if (f <= -2147483648.0f)
				sample = INT_MIN;
			else if (f == f)
				sample = (int32_t)f;

			/* Keep the top 24 bits. */
			bits = (uint32_t)sample >> 8;
			output[0] = bits & 0xff;
			output[1] = (bits >> 8) & 0xff;
			output[2] = (bits >> 16) & 0xff;
		}
}
536 
/* Converts per-channel floats to interleaved signed 32-bit samples.
 *
 * Every sample is multiplied by 2^31, has +/- 0.5 added, and is clamped to
 * the int range before conversion.
 *
 * (float)INT_MAX rounds up to 2^31, which does not fit an int, so the upper
 * bound is tested explicitly instead of with min(): converting 2^31 to an
 * int is undefined behavior and turned +1.0f into the most negative sample
 * on some targets.  NaN is written as 0.
 *
 * input[c] holds the frames samples of channel c; output receives
 * frames * channels samples, frame after frame.
 */
static void dsp_util_interleave_s32le(float *const *input, int32_t *output,
				      int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++, output++) {
			float f = input[j][i] * 2147483648.0f;
			int32_t sample = 0;	/* NaN is written as 0. */

			f += (f >= 0) ? 0.5f : -0.5f;
			if (f >= 2147483648.0f)
				sample = INT_MAX;
			else if (f <= -2147483648.0f)
				sample = INT_MIN;
			else if (f == f)
				sample = (int32_t)f;
			*output = sample;
		}
}
553 
/* Converts per-channel floats to interleaved integer samples, picking the
 * converter that matches the sample format.
 *
 * Handles S16_LE, S24_LE, S24_3LE and S32_LE.  output is reinterpreted as
 * the sample type of the chosen format.
 *
 * Returns 0 on success.  For any other format an error is logged and
 * -EINVAL is returned without touching output.
 */
int dsp_util_interleave(float *const *input, uint8_t *output, int channels,
			snd_pcm_format_t format, int frames)
{
	switch (format) {
	case SND_PCM_FORMAT_S16_LE:
		dsp_util_interleave_s16le(input, (int16_t *)output,
					  channels, frames);
		return 0;
	case SND_PCM_FORMAT_S24_LE:
		dsp_util_interleave_s24le(input, (int32_t *)output,
					  channels, frames);
		return 0;
	case SND_PCM_FORMAT_S24_3LE:
		dsp_util_interleave_s243le(input, output, channels, frames);
		return 0;
	case SND_PCM_FORMAT_S32_LE:
		dsp_util_interleave_s32le(input, (int32_t *)output,
					  channels, frames);
		return 0;
	default:
		break;
	}

	syslog(LOG_ERR, "Invalid format to interleave");
	return -EINVAL;
}
579 
/* Makes the floating-point unit flush denormal numbers to zero, by setting
 * the corresponding bits in the floating-point control register.  The
 * existing contents of the register are otherwise preserved.
 *
 * On architectures other than x86, AArch64 and ARM this does nothing and the
 * build emits a warning.
 */
void dsp_enable_flush_denormal_to_zero(void)
{
#if defined(__i386__) || defined(__x86_64__)
	/* 0x8040 sets MXCSR bit 15 (flush-to-zero) and bit 6
	 * (denormals-are-zero).
	 */
	unsigned int mxcsr;
	mxcsr = __builtin_ia32_stmxcsr();
	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
#elif defined(__aarch64__)
	/* 0x1000000 sets bit 24 of FPCR (flush-to-zero); isb makes sure the
	 * new setting is seen by the instructions that follow.
	 */
	uint64_t cw;
	__asm__ __volatile__ (
		"mrs    %0, fpcr			    \n"
		"orr    %0, %0, #0x1000000		    \n"
		"msr    fpcr, %0			    \n"
		"isb					    \n"
		: "=r"(cw) :: "memory");
#elif defined(__arm__)
	/* 0x1000000 sets bit 24 of FPSCR (flush-to-zero). */
	uint32_t cw;
	__asm__ __volatile__ (
		"vmrs   %0, fpscr			    \n"
		"orr    %0, %0, #0x1000000		    \n"
		"vmsr   fpscr, %0			    \n"
		: "=r"(cw) :: "memory");
#else
#warning "Don't know how to disable denorms. Performance may suffer."
#endif
}
605