1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 #include "dsp_util.h"
7
/*
 * Type-generic max()/min() built on GCC statement expressions; each argument
 * is evaluated exactly once.
 *
 * The temporaries use macro-specific names.  With generic names such as _a
 * and _b, an argument that is itself called _a or _b would be captured by the
 * macro's own declaration (initialising the temporary from itself), and
 * nesting min() inside max() would shadow the outer temporaries.
 */
#ifndef max
#define max(a, b) ({ __typeof__(a) dsp_max_lhs_ = (a); \
		     __typeof__(b) dsp_max_rhs_ = (b); \
		     dsp_max_lhs_ > dsp_max_rhs_ ? dsp_max_lhs_ : dsp_max_rhs_; })
#endif

#ifndef min
#define min(a, b) ({ __typeof__(a) dsp_min_lhs_ = (a); \
		     __typeof__(b) dsp_min_rhs_ = (b); \
		     dsp_min_lhs_ < dsp_min_rhs_ ? dsp_min_lhs_ : dsp_min_rhs_; })
#endif

/*
 * Start with no SIMD stereo helper advertised.  An architecture-specific
 * section that provides one defines the macro of the same name afterwards,
 * which is what the #ifdef checks in the public entry points look for.
 */
#undef deinterleave_stereo
#undef interleave_stereo
22
23 #ifdef __ARM_NEON__
24 #include <arm_neon.h>
25
deinterleave_stereo(int16_t * input,float * output1,float * output2,int frames)26 static void deinterleave_stereo(int16_t *input, float *output1,
27 float *output2, int frames)
28 {
29 /* Process 8 frames (16 samples) each loop. */
30 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
31 int chunk = frames >> 3;
32 frames &= 7;
33 if (chunk) {
34 __asm__ __volatile__ (
35 "1: \n"
36 "vld2.16 {d0-d3}, [%[input]]! \n"
37 "subs %[chunk], #1 \n"
38 "vmovl.s16 q3, d3 \n"
39 "vmovl.s16 q2, d2 \n"
40 "vmovl.s16 q1, d1 \n"
41 "vmovl.s16 q0, d0 \n"
42 "vcvt.f32.s32 q3, q3, #15 \n"
43 "vcvt.f32.s32 q2, q2, #15 \n"
44 "vcvt.f32.s32 q1, q1, #15 \n"
45 "vcvt.f32.s32 q0, q0, #15 \n"
46 "vst1.32 {d4-d7}, [%[output2]]! \n"
47 "vst1.32 {d0-d3}, [%[output1]]! \n"
48 "bne 1b \n"
49 : /* output */
50 [chunk]"+r"(chunk),
51 [input]"+r"(input),
52 [output1]"+r"(output1),
53 [output2]"+r"(output2)
54 : /* input */
55 : /* clobber */
56 "q0", "q1", "q2", "q3", "memory", "cc"
57 );
58 }
59
60 /* The remaining samples. */
61 while (frames--) {
62 *output1++ = *input++ / 32768.0f;
63 *output2++ = *input++ / 32768.0f;
64 }
65 }
66 #define deinterleave_stereo deinterleave_stereo
67
/*
 * Converts one float sample to int16 for the scalar tail of the NEON
 * interleave_stereo(): add half an LSB away from zero, scale by 32768,
 * truncate and saturate to [-32768, 32767].
 *
 * The range check is done on the float value.  Converting an out-of-range or
 * NaN float to an integer is undefined in C, so clamping after the cast is
 * too late.  NaN is mapped to 0.
 */
static int16_t neon_tail_to_s16(float f)
{
	f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f);
	f *= 32768.0f;
	if (f >= 32767.0f)
		return 32767;
	if (f <= -32768.0f)
		return -32768;
	if (f != f)
		return 0;
	return (int16_t)f;
}

/*
 * NEON implementation: merges two float channels into interleaved signed
 * 16-bit stereo samples, scaling by 32768 with rounding and saturation.
 *
 * input1: samples written to the first slot of each frame (L0 L1 ...).
 * input2: samples written to the second slot of each frame (R0 R1 ...).
 * output: receives L0 R0 L1 R1 ...; 2 * frames values are written.
 * frames: number of stereo frames to convert; nothing is done when <= 0.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
	int chunk;

	/*
	 * The assembly loop counts chunk down until it hits exactly zero, so a
	 * negative frame count would make it run far past the buffers.
	 */
	if (frames <= 0)
		return;

	chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		__asm__ __volatile__ (
			/* q0 = 0.0, the comparison reference for the sign test. */
			"veor q0, q0, q0			    \n"
			"1:					    \n"
			"vld1.32 {d2-d3}, [%[input1]]!		    \n"
			"vld1.32 {d4-d5}, [%[input2]]!		    \n"
			"subs %[chunk], #1			    \n"
			/* We try to round to the nearest number by adding 0.5
			 * to positive input, and adding -0.5 to the negative
			 * input, then truncate.
			 */
			"vcgt.f32 q3, q1, q0			    \n"
			"vcgt.f32 q4, q2, q0			    \n"
			"vbsl q3, %q[pos], %q[neg]		    \n"
			"vbsl q4, %q[pos], %q[neg]		    \n"
			"vadd.f32 q1, q1, q3			    \n"
			"vadd.f32 q2, q2, q4			    \n"
			/* Fixed-point convert with 15 fraction bits: x * 32768. */
			"vcvt.s32.f32 q1, q1, #15		    \n"
			"vcvt.s32.f32 q2, q2, #15		    \n"
			/* Saturating narrow to 16 bits, then interleaving store. */
			"vqmovn.s32 d2, q1			    \n"
			"vqmovn.s32 d3, q2			    \n"
			"vst2.16 {d2-d3}, [%[output]]!		    \n"
			"bne 1b					    \n"
			: /* output */
			  "=r"(chunk),
			  "=r"(input1),
			  "=r"(input2),
			  "=r"(output)
			: /* input */
			  [chunk]"0"(chunk),
			  [input1]"1"(input1),
			  [input2]"2"(input2),
			  [output]"3"(output),
			  [pos]"w"(pos),
			  [neg]"w"(neg)
			: /* clobber */
			  "q0", "q1", "q2", "q3", "q4", "memory", "cc"
			);
	}

	/* The remaining samples (at most 3 frames). */
	while (frames--) {
		*output++ = neon_tail_to_s16(*input1++);
		*output++ = neon_tail_to_s16(*input2++);
	}
}
#define interleave_stereo interleave_stereo
130
131 #endif
132
133 #ifdef __SSE3__
134 #include <emmintrin.h>
135
/*
 * SSE implementation: splits interleaved signed 16-bit stereo samples into
 * two float channels, scaling every sample by 1/32768.
 *
 * input:   interleaved samples L0 R0 L1 R1 ...; 2 * frames values are read.
 * output1: receives the first sample of each frame (L0 L1 L2 ...).
 * output2: receives the second sample of each frame (R0 R1 R2 ...).
 * frames:  number of stereo frames to convert; nothing is done when <= 0.
 *
 * All loads and stores in the assembly loop are the unaligned variants.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	int chunk;

	/*
	 * The assembly loop counts chunk down until it hits exactly zero, so a
	 * negative frame count would make it run far past the buffers.  The
	 * generic per-channel loop does nothing for such a count; do the same.
	 */
	if (frames <= 0)
		return;

	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	chunk = frames >> 3;
	frames &= 7;
	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			/* Each 32-bit lane holds one frame: L low, R high. */
			"lddqu (%[input]), %%xmm0                   \n"
			"lddqu 16(%[input]), %%xmm1                 \n"
			"add $32, %[input]                          \n"
			"movdqa %%xmm0, %%xmm2                      \n"
			"movdqa %%xmm1, %%xmm3                      \n"
			/*
			 * Left: move the low half to the top of the lane, so
			 * the lane reads as L * 65536 (scaled back below).
			 * Right: arithmetic shift sign-extends the high half.
			 */
			"pslld $16, %%xmm0                          \n"
			"pslld $16, %%xmm1                          \n"
			"psrad $16, %%xmm2                          \n"
			"psrad $16, %%xmm3                          \n"
			"cvtdq2ps %%xmm0, %%xmm0                    \n"
			"cvtdq2ps %%xmm1, %%xmm1                    \n"
			"cvtdq2ps %%xmm2, %%xmm2                    \n"
			"cvtdq2ps %%xmm3, %%xmm3                    \n"
			/* 2^-31 undoes the extra 2^16 on the left channel. */
			"mulps %[scale_2_n31], %%xmm0               \n"
			"mulps %[scale_2_n31], %%xmm1               \n"
			"mulps %[scale_2_n15], %%xmm2               \n"
			"mulps %[scale_2_n15], %%xmm3               \n"
			"movdqu %%xmm0, (%[output1])                \n"
			"movdqu %%xmm1, 16(%[output1])              \n"
			"movdqu %%xmm2, (%[output2])                \n"
			"movdqu %%xmm3, 16(%[output2])              \n"
			"add $32, %[output1]                        \n"
			"add $32, %[output2]                        \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			  [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
			  [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
			);
	}

	/* The remaining samples (at most 7 frames). */
	while (frames--) {
		*output1++ = *input++ / 32768.0f;
		*output2++ = *input++ / 32768.0f;
	}
}
#define deinterleave_stereo deinterleave_stereo
191
/*
 * Converts one float sample to int16 for the scalar tail of the SSE
 * interleave_stereo(): add half an LSB away from zero, scale by 32768,
 * truncate and saturate to [-32768, 32767].
 *
 * The range check is done on the float value.  Converting an out-of-range or
 * NaN float to an integer is undefined in C (and yields INT_MIN on x86, which
 * would turn a large positive sample into -32768), so clamping after the cast
 * is too late.  The first test is written so that NaN also takes it, giving
 * the same result as the minps clamp in the vector loop.
 */
static int16_t sse_tail_to_s16(float f)
{
	f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f);
	f *= 32768.0f;
	if (!(f < 32767.0f))
		return 32767;
	if (f <= -32768.0f)
		return -32768;
	return (int16_t)f;
}

/*
 * SSE implementation: merges two float channels into interleaved signed
 * 16-bit stereo samples, scaling by 32768 with rounding and saturation.
 *
 * input1: samples written to the first slot of each frame (L0 L1 ...).
 * input2: samples written to the second slot of each frame (R0 R1 ...).
 * output: receives L0 R0 L1 R1 ...; 2 * frames values are written.
 * frames: number of stereo frames to convert; nothing is done when <= 0.
 *
 * The vector loop rounds with cvtps2dq (current MXCSR rounding mode) while
 * the scalar tail adds half an LSB and truncates, so the two can differ on
 * exact ties.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	int chunk;

	/*
	 * The assembly loop counts chunk down until it hits exactly zero, so a
	 * negative frame count would make it run far past the buffers.
	 */
	if (frames <= 0)
		return;

	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			"lddqu (%[input1]), %%xmm0                  \n"
			"lddqu (%[input2]), %%xmm2                  \n"
			"movaps %%xmm0, %%xmm1                      \n"
			/* xmm0 = L0 R0 L1 R1, xmm1 = L2 R2 L3 R3 */
			"unpcklps %%xmm2, %%xmm0                    \n"
			"unpckhps %%xmm2, %%xmm1                    \n"
			"add $16, %[input1]                         \n"
			"add $16, %[input2]                         \n"
			"mulps %[scale_2_15], %%xmm0                \n"
			"mulps %[scale_2_15], %%xmm1                \n"
			/*
			 * cvtps2dq returns 0x80000000 for anything that does
			 * not fit in 32 bits.  For negative overflow packssdw
			 * still saturates that to -32768, but for positive
			 * overflow the result would flip sign.  Cap the
			 * positive side first.
			 */
			"minps %[max_s16], %%xmm0                   \n"
			"minps %[max_s16], %%xmm1                   \n"
			"cvtps2dq %%xmm0, %%xmm0                    \n"
			"cvtps2dq %%xmm1, %%xmm1                    \n"
			/* Saturating pack of both halves into 8 int16 values. */
			"packssdw %%xmm1, %%xmm0                    \n"
			"movdqu %%xmm0, (%[output])                 \n"
			"add $16, %[output]                         \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  "=r"(chunk),
			  "=r"(input1),
			  "=r"(input2),
			  "=r"(output)
			: /* input */
			  [chunk]"0"(chunk),
			  [input1]"1"(input1),
			  [input2]"2"(input2),
			  [output]"3"(output),
			  [scale_2_15]"x"(_mm_set1_ps(1.0f*(1<<15))),
			  [max_s16]"x"(_mm_set1_ps(32767.0f))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "memory", "cc"
			);
	}

	/* The remaining samples (at most 3 frames). */
	while (frames--) {
		*output++ = sse_tail_to_s16(*input1++);
		*output++ = sse_tail_to_s16(*input2++);
	}
}
#define interleave_stereo interleave_stereo
247
248 #endif
249
/*
 * Converts interleaved signed 16-bit samples into one float buffer per
 * channel, scaling every sample by 1/32768.
 *
 * input:    interleaved samples, frame after frame; channels * frames values
 *           are read.
 * output:   array of `channels` pointers; output[c] receives `frames` floats.
 * channels: number of channels in each frame.
 * frames:   number of frames to convert.
 *
 * Nothing is done when channels or frames is <= 0.  When a SIMD
 * deinterleave_stereo() is available it handles the two-channel case.
 */
void dsp_util_deinterleave(int16_t *input, float *const *output, int channels,
			   int frames)
{
	int i, j;

	if (channels <= 0 || frames <= 0)
		return;

#ifdef deinterleave_stereo
	if (channels == 2) {
		deinterleave_stereo(input, output[0], output[1], frames);
		return;
	}
#endif

	/*
	 * Index the destination buffers directly instead of copying the
	 * pointers into a variable-length array: no stack use that grows with
	 * the channel count, and no array whose size comes from an unchecked
	 * argument.
	 */
	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++)
			output[j][i] = *input++ / 32768.0f;
}
270
/*
 * Converts one float buffer per channel into interleaved signed 16-bit
 * samples.  Every sample is scaled by 32768, rounded half away from zero and
 * saturated to [-32768, 32767]; a NaN sample becomes 0.
 *
 * input:    array of `channels` pointers; `frames` floats are read from each.
 * output:   receives the interleaved samples, frame after frame;
 *           channels * frames values are written.
 * channels: number of channels in each frame.
 * frames:   number of frames to convert.
 *
 * Nothing is done when channels or frames is <= 0.  When a SIMD
 * interleave_stereo() is available it handles the two-channel case.
 */
void dsp_util_interleave(float *const *input, int16_t *output, int channels,
			 int frames)
{
	int i, j;

	if (channels <= 0 || frames <= 0)
		return;

#ifdef interleave_stereo
	if (channels == 2) {
		interleave_stereo(input[0], input[1], output, frames);
		return;
	}
#endif

	/*
	 * Index the source buffers directly instead of copying the pointers
	 * into a variable-length array sized by an unchecked argument.
	 */
	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++) {
			int16_t i16;
			float f = input[j][i] * 32768.0f;

			if (f > 32767)
				i16 = 32767;
			else if (f < -32768)
				i16 = -32768;
			else if (f != f)
				/* NaN: the cast below would be undefined. */
				i16 = 0;
			else
				i16 = (int16_t) (f > 0 ? f + 0.5f : f - 0.5f);
			*output++ = i16;
		}
}
300
/*
 * Switches the floating-point unit into flush-to-zero mode so that denormal
 * values are replaced by zero.
 *
 * The setting lives in a floating-point control register (MXCSR, FPCR or
 * FPSCR depending on the architecture).  On architectures that are not
 * handled here the function does nothing and a compile-time warning is
 * emitted.
 */
void dsp_enable_flush_denormal_to_zero(void)
{
#if defined(__i386__) || defined(__x86_64__)
	unsigned int mxcsr;
	mxcsr = __builtin_ia32_stmxcsr();
	/* 0x8000: flush-to-zero (results); 0x0040: denormals-are-zero (inputs). */
	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
#elif defined(__aarch64__)
	/* __arm__ is not defined on AArch64; set FPCR.FZ (bit 24) there. */
	uint64_t fpcr;
	__asm__ __volatile__ ("mrs %0, fpcr" : "=r" (fpcr));
	__asm__ __volatile__ ("msr fpcr, %0" : : "r" (fpcr | (UINT64_C(1) << 24)));
#elif defined(__arm__)
	/* Coprocessor encoding of a FPSCR read/write; bit 24 is FZ. */
	int cw;
	__asm__ __volatile__ ("mrc p10, 7, %0, cr1, cr0, 0" : "=r" (cw));
	__asm__ __volatile__ ("mcr p10, 7, %0, cr1, cr0, 0" : : "r" (cw | (1 << 24)));
#else
#warning "Don't know how to disable denorms. Performance may suffer."
#endif
}
315