1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE file.
4  */
5 
6 #include <string.h>
7 #include "crossover2.h"
8 #include "biquad.h"
9 
lr42_set(struct lr42 * lr42,enum biquad_type type,float freq)10 static void lr42_set(struct lr42 *lr42, enum biquad_type type, float freq)
11 {
12 	struct biquad q;
13 	biquad_set(&q, type, freq, 0, 0);
14 	memset(lr42, 0, sizeof(*lr42));
15 	lr42->b0 = q.b0;
16 	lr42->b1 = q.b1;
17 	lr42->b2 = q.b2;
18 	lr42->a1 = q.a1;
19 	lr42->a2 = q.a2;
20 }
21 
22 /* Split input data using two LR4 filters, put the result into the input array
23  * and another array.
24  *
25  * data0 --+-- lp --> data0
26  *         |
27  *         \-- hp --> data1
28  */
29 #if defined(__ARM_NEON__)
30 #include <arm_neon.h>
/* NEON implementation of the stereo two-way split.
 *
 * Each sample of data0L/data0R is run through two cascaded biquad stages
 * using the lp coefficients and, in parallel, through two cascaded stages
 * using the hp coefficients. The lp results overwrite data0L/data0R and the
 * hp results are written to data1L/data1R.
 *
 * Args:
 *    lp, hp - Filter coefficients and delay-line state. The state is read
 *             on entry and written back on exit.
 *    count - Number of samples per channel. Nothing is done if count <= 0.
 *    data0L, data0R - Input samples; replaced by the lp outputs.
 *    data1L, data1R - Receive the hp outputs.
 *
 * All four filter paths are processed at once, one per vector lane:
 *    lane 0 = lp/left, lane 1 = hp/left, lane 2 = lp/right, lane 3 = hp/right
 */
static void lr42_split(struct lr42 *lp, struct lr42 *hp, int count,
		       float *data0L, float *data0R, float *data1L,
		       float *data1R)
{
	/* Delay-line state: x = input, y = first stage out, z = second stage
	 * out; suffix 1/2 = one/two samples ago. */
	float32x4_t x1 = { lp->x1L, hp->x1L, lp->x1R, hp->x1R };
	float32x4_t x2 = { lp->x2L, hp->x2L, lp->x2R, hp->x2R };
	float32x4_t y1 = { lp->y1L, hp->y1L, lp->y1R, hp->y1R };
	float32x4_t y2 = { lp->y2L, hp->y2L, lp->y2R, hp->y2R };
	float32x4_t z1 = { lp->z1L, hp->z1L, lp->z1R, hp->z1R };
	float32x4_t z2 = { lp->z2L, hp->z2L, lp->z2R, hp->z2R };
	/* Coefficients, replicated for the left and right lanes. */
	float32x4_t b0 = { lp->b0, hp->b0, lp->b0, hp->b0 };
	float32x4_t b1 = { lp->b1, hp->b1, lp->b1, hp->b1 };
	float32x4_t b2 = { lp->b2, hp->b2, lp->b2, hp->b2 };
	float32x4_t a1 = { lp->a1, hp->a1, lp->a1, hp->a1 };
	float32x4_t a2 = { lp->a2, hp->a2, lp->a2, hp->a2 };

	/* The loop below decrements count and only then tests for zero, so it
	 * always executes at least once. A zero or negative count would keep
	 * it running (and storing past the buffers) until the counter wraps.
	 */
	if (count <= 0)
		return;

	// clang-format off
	__asm__ __volatile__(
		/* q0 = x, q1 = y, q2 = z */
		"1:                                     \n"
		"vmul.f32 q1, %q[b1], %q[x1]            \n"
		"vld1.32 d0[], [%[data0L]]              \n"
		"vld1.32 d1[], [%[data0R]]              \n"
		"subs %[count], #1                      \n"
		"vmul.f32 q2, %q[b1], %q[y1]            \n"
		"vmla.f32 q1, %q[b0], q0                \n"
		"vmla.f32 q1, %q[b2], %q[x2]            \n"
		"vmov.f32 %q[x2], %q[x1]                \n"
		"vmov.f32 %q[x1], q0                    \n"
		"vmls.f32 q1, %q[a1], %q[y1]            \n"
		"vmls.f32 q1, %q[a2], %q[y2]            \n"
		"vmla.f32 q2, %q[b0], q1                \n"
		"vmla.f32 q2, %q[b2], %q[y2]            \n"
		"vmov.f32 %q[y2], %q[y1]                \n"
		"vmov.f32 %q[y1], q1                    \n"
		"vmls.f32 q2, %q[a1], %q[z1]            \n"
		"vmls.f32 q2, %q[a2], %q[z2]            \n"
		"vmov.f32 %q[z2], %q[z1]                \n"
		"vmov.f32 %q[z1], q2                    \n"
		"vst1.f32 d4[0], [%[data0L]]!           \n"
		"vst1.f32 d4[1], [%[data1L]]!           \n"
		"vst1.f32 d5[0], [%[data0R]]!           \n"
		"vst1.f32 d5[1], [%[data1R]]!           \n"
		"bne 1b                                 \n"
		: /* output */
		  "=r"(data0L),
		  "=r"(data0R),
		  "=r"(data1L),
		  "=r"(data1R),
		  "=r"(count),
		  [x1]"+w"(x1),
		  [x2]"+w"(x2),
		  [y1]"+w"(y1),
		  [y2]"+w"(y2),
		  [z1]"+w"(z1),
		  [z2]"+w"(z2)
		: /* input */
		  [data0L]"0"(data0L),
		  [data0R]"1"(data0R),
		  [data1L]"2"(data1L),
		  [data1R]"3"(data1R),
		  [count]"4"(count),
		  [b0]"w"(b0),
		  [b1]"w"(b1),
		  [b2]"w"(b2),
		  [a1]"w"(a1),
		  [a2]"w"(a2)
		: /* clobber */
		  "q0", "q1", "q2", "memory", "cc");
	// clang-format on

	/* Unpack the lp state (lanes 0 and 2). */
	lp->x1L = x1[0];
	lp->x1R = x1[2];
	lp->x2L = x2[0];
	lp->x2R = x2[2];
	lp->y1L = y1[0];
	lp->y1R = y1[2];
	lp->y2L = y2[0];
	lp->y2R = y2[2];
	lp->z1L = z1[0];
	lp->z1R = z1[2];
	lp->z2L = z2[0];
	lp->z2R = z2[2];

	/* Unpack the hp state (lanes 1 and 3). */
	hp->x1L = x1[1];
	hp->x1R = x1[3];
	hp->x2L = x2[1];
	hp->x2R = x2[3];
	hp->y1L = y1[1];
	hp->y1R = y1[3];
	hp->y2L = y2[1];
	hp->y2R = y2[3];
	hp->z1L = z1[1];
	hp->z1R = z1[3];
	hp->z2L = z2[1];
	hp->z2R = z2[3];
}
128 #elif defined(__SSE3__) && defined(__x86_64__)
129 #include <emmintrin.h>
lr42_split(struct lr42 * lp,struct lr42 * hp,int count,float * data0L,float * data0R,float * data1L,float * data1R)130 static void lr42_split(struct lr42 *lp, struct lr42 *hp, int count,
131 		       float *data0L, float *data0R, float *data1L,
132 		       float *data1R)
133 {
134 	__m128 x1 = { lp->x1L, hp->x1L, lp->x1R, hp->x1R };
135 	__m128 x2 = { lp->x2L, hp->x2L, lp->x2R, hp->x2R };
136 	__m128 y1 = { lp->y1L, hp->y1L, lp->y1R, hp->y1R };
137 	__m128 y2 = { lp->y2L, hp->y2L, lp->y2R, hp->y2R };
138 	__m128 z1 = { lp->z1L, hp->z1L, lp->z1R, hp->z1R };
139 	__m128 z2 = { lp->z2L, hp->z2L, lp->z2R, hp->z2R };
140 	__m128 b0 = { lp->b0, hp->b0, lp->b0, hp->b0 };
141 	__m128 b1 = { lp->b1, hp->b1, lp->b1, hp->b1 };
142 	__m128 b2 = { lp->b2, hp->b2, lp->b2, hp->b2 };
143 	__m128 a1 = { lp->a1, hp->a1, lp->a1, hp->a1 };
144 	__m128 a2 = { lp->a2, hp->a2, lp->a2, hp->a2 };
145 
146 	// clang-format off
147 	__asm__ __volatile__(
148 		"1:                                     \n"
149 		"movss (%[data0L]), %%xmm2              \n"
150 		"movss (%[data0R]), %%xmm1              \n"
151 		"shufps $0, %%xmm1, %%xmm2              \n"
152 		"mulps %[b2],%[x2]                      \n"
153 		"movaps %[b0], %%xmm0                   \n"
154 		"mulps %[a2],%[z2]                      \n"
155 		"movaps %[b1], %%xmm1                   \n"
156 		"mulps %%xmm2,%%xmm0                    \n"
157 		"mulps %[x1],%%xmm1                     \n"
158 		"addps %%xmm1,%%xmm0                    \n"
159 		"movaps %[a1],%%xmm1                    \n"
160 		"mulps %[y1],%%xmm1                     \n"
161 		"addps %[x2],%%xmm0                     \n"
162 		"movaps %[b1],%[x2]                     \n"
163 		"mulps %[y1],%[x2]                      \n"
164 		"subps %%xmm1,%%xmm0                    \n"
165 		"movaps %[a2],%%xmm1                    \n"
166 		"mulps %[y2],%%xmm1                     \n"
167 		"mulps %[b2],%[y2]                      \n"
168 		"subps %%xmm1,%%xmm0                    \n"
169 		"movaps %[b0],%%xmm1                    \n"
170 		"mulps %%xmm0,%%xmm1                    \n"
171 		"addps %[x2],%%xmm1                     \n"
172 		"movaps %[x1],%[x2]                     \n"
173 		"movaps %%xmm2,%[x1]                    \n"
174 		"addps %[y2],%%xmm1                     \n"
175 		"movaps %[a1],%[y2]                     \n"
176 		"mulps %[z1],%[y2]                      \n"
177 		"subps %[y2],%%xmm1                     \n"
178 		"movaps %[y1],%[y2]                     \n"
179 		"movaps %%xmm0,%[y1]                    \n"
180 		"subps %[z2],%%xmm1                     \n"
181 		"movaps %[z1],%[z2]                     \n"
182 		"movaps %%xmm1,%[z1]                    \n"
183 		"movss %%xmm1, (%[data0L])              \n"
184 		"shufps $0x39, %%xmm1, %%xmm1           \n"
185 		"movss %%xmm1, (%[data1L])              \n"
186 		"shufps $0x39, %%xmm1, %%xmm1           \n"
187 		"movss %%xmm1, (%[data0R])              \n"
188 		"shufps $0x39, %%xmm1, %%xmm1           \n"
189 		"movss %%xmm1, (%[data1R])              \n"
190 		"add $4, %[data0L]                      \n"
191 		"add $4, %[data1L]                      \n"
192 		"add $4, %[data0R]                      \n"
193 		"add $4, %[data1R]                      \n"
194 		"sub $1, %[count]                       \n"
195 		"jnz 1b                                 \n"
196 		: /* output */
197 		  [data0L]"+r"(data0L),
198 		  [data0R]"+r"(data0R),
199 		  [data1L]"+r"(data1L),
200 		  [data1R]"+r"(data1R),
201 		  [count]"+r"(count),
202 		  [x1]"+x"(x1),
203 		  [x2]"+x"(x2),
204 		  [y1]"+x"(y1),
205 		  [y2]"+x"(y2),
206 		  [z1]"+x"(z1),
207 		  [z2]"+x"(z2)
208 		: /* input */
209 		  [b0]"x"(b0),
210 		  [b1]"x"(b1),
211 		  [b2]"x"(b2),
212 		  [a1]"x"(a1),
213 		  [a2]"x"(a2)
214 		: /* clobber */
215 		  "xmm0", "xmm1", "xmm2", "memory", "cc");
216 	// clang-format on
217 
218 	lp->x1L = x1[0];
219 	lp->x1R = x1[2];
220 	lp->x2L = x2[0];
221 	lp->x2R = x2[2];
222 	lp->y1L = y1[0];
223 	lp->y1R = y1[2];
224 	lp->y2L = y2[0];
225 	lp->y2R = y2[2];
226 	lp->z1L = z1[0];
227 	lp->z1R = z1[2];
228 	lp->z2L = z2[0];
229 	lp->z2R = z2[2];
230 
231 	hp->x1L = x1[1];
232 	hp->x1R = x1[3];
233 	hp->x2L = x2[1];
234 	hp->x2R = x2[3];
235 	hp->y1L = y1[1];
236 	hp->y1R = y1[3];
237 	hp->y2L = y2[1];
238 	hp->y2R = y2[3];
239 	hp->z1L = z1[1];
240 	hp->z1R = z1[3];
241 	hp->z2L = z2[1];
242 	hp->z2R = z2[3];
243 }
244 #else
lr42_split(struct lr42 * lp,struct lr42 * hp,int count,float * data0L,float * data0R,float * data1L,float * data1R)245 static void lr42_split(struct lr42 *lp, struct lr42 *hp, int count,
246 		       float *data0L, float *data0R, float *data1L,
247 		       float *data1R)
248 {
249 	float lx1L = lp->x1L, lx1R = lp->x1R;
250 	float lx2L = lp->x2L, lx2R = lp->x2R;
251 	float ly1L = lp->y1L, ly1R = lp->y1R;
252 	float ly2L = lp->y2L, ly2R = lp->y2R;
253 	float lz1L = lp->z1L, lz1R = lp->z1R;
254 	float lz2L = lp->z2L, lz2R = lp->z2R;
255 	float lb0 = lp->b0;
256 	float lb1 = lp->b1;
257 	float lb2 = lp->b2;
258 	float la1 = lp->a1;
259 	float la2 = lp->a2;
260 
261 	float hx1L = hp->x1L, hx1R = hp->x1R;
262 	float hx2L = hp->x2L, hx2R = hp->x2R;
263 	float hy1L = hp->y1L, hy1R = hp->y1R;
264 	float hy2L = hp->y2L, hy2R = hp->y2R;
265 	float hz1L = hp->z1L, hz1R = hp->z1R;
266 	float hz2L = hp->z2L, hz2R = hp->z2R;
267 	float hb0 = hp->b0;
268 	float hb1 = hp->b1;
269 	float hb2 = hp->b2;
270 	float ha1 = hp->a1;
271 	float ha2 = hp->a2;
272 
273 	int i;
274 	for (i = 0; i < count; i++) {
275 		float xL, yL, zL, xR, yR, zR;
276 		xL = data0L[i];
277 		xR = data0R[i];
278 		yL = lb0 * xL + lb1 * lx1L + lb2 * lx2L - la1 * ly1L -
279 		     la2 * ly2L;
280 		yR = lb0 * xR + lb1 * lx1R + lb2 * lx2R - la1 * ly1R -
281 		     la2 * ly2R;
282 		zL = lb0 * yL + lb1 * ly1L + lb2 * ly2L - la1 * lz1L -
283 		     la2 * lz2L;
284 		zR = lb0 * yR + lb1 * ly1R + lb2 * ly2R - la1 * lz1R -
285 		     la2 * lz2R;
286 		lx2L = lx1L;
287 		lx2R = lx1R;
288 		lx1L = xL;
289 		lx1R = xR;
290 		ly2L = ly1L;
291 		ly2R = ly1R;
292 		ly1L = yL;
293 		ly1R = yR;
294 		lz2L = lz1L;
295 		lz2R = lz1R;
296 		lz1L = zL;
297 		lz1R = zR;
298 		data0L[i] = zL;
299 		data0R[i] = zR;
300 
301 		yL = hb0 * xL + hb1 * hx1L + hb2 * hx2L - ha1 * hy1L -
302 		     ha2 * hy2L;
303 		yR = hb0 * xR + hb1 * hx1R + hb2 * hx2R - ha1 * hy1R -
304 		     ha2 * hy2R;
305 		zL = hb0 * yL + hb1 * hy1L + hb2 * hy2L - ha1 * hz1L -
306 		     ha2 * hz2L;
307 		zR = hb0 * yR + hb1 * hy1R + hb2 * hy2R - ha1 * hz1R -
308 		     ha2 * hz2R;
309 		hx2L = hx1L;
310 		hx2R = hx1R;
311 		hx1L = xL;
312 		hx1R = xR;
313 		hy2L = hy1L;
314 		hy2R = hy1R;
315 		hy1L = yL;
316 		hy1R = yR;
317 		hz2L = hz1L;
318 		hz2R = hz1R;
319 		hz1L = zL;
320 		hz1R = zR;
321 		data1L[i] = zL;
322 		data1R[i] = zR;
323 	}
324 
325 	lp->x1L = lx1L;
326 	lp->x1R = lx1R;
327 	lp->x2L = lx2L;
328 	lp->x2R = lx2R;
329 	lp->y1L = ly1L;
330 	lp->y1R = ly1R;
331 	lp->y2L = ly2L;
332 	lp->y2R = ly2R;
333 	lp->z1L = lz1L;
334 	lp->z1R = lz1R;
335 	lp->z2L = lz2L;
336 	lp->z2R = lz2R;
337 
338 	hp->x1L = hx1L;
339 	hp->x1R = hx1R;
340 	hp->x2L = hx2L;
341 	hp->x2R = hx2R;
342 	hp->y1L = hy1L;
343 	hp->y1R = hy1R;
344 	hp->y2L = hy2L;
345 	hp->y2R = hy2R;
346 	hp->z1L = hz1L;
347 	hp->z1R = hz1R;
348 	hp->z2L = hz2L;
349 	hp->z2R = hz2R;
350 }
351 #endif
352 
353 /* Split input data using two LR4 filters and sum them back to the original
354  * data array.
355  *
356  * data --+-- lp --+--> data
357  *        |        |
358  *        \-- hp --/
359  */
360 #if defined(__ARM_NEON__)
361 #include <arm_neon.h>
/* NEON implementation of the stereo split-and-sum.
 *
 * Each sample of dataL/dataR is run through two cascaded biquad stages
 * using the lp coefficients and, in parallel, through two cascaded stages
 * using the hp coefficients. The lp and hp results of each channel are then
 * added together and written back over the input sample.
 *
 * Args:
 *    lp, hp - Filter coefficients and delay-line state. The state is read
 *             on entry and written back on exit.
 *    count - Number of samples per channel. Nothing is done if count <= 0.
 *    dataL, dataR - Samples to process in place.
 *
 * All four filter paths are processed at once, one per vector lane:
 *    lane 0 = lp/left, lane 1 = hp/left, lane 2 = lp/right, lane 3 = hp/right
 * so the final pairwise add (vpadd) yields { left sum, right sum }.
 */
static void lr42_merge(struct lr42 *lp, struct lr42 *hp, int count,
		       float *dataL, float *dataR)
{
	/* Delay-line state: x = input, y = first stage out, z = second stage
	 * out; suffix 1/2 = one/two samples ago. */
	float32x4_t x1 = { lp->x1L, hp->x1L, lp->x1R, hp->x1R };
	float32x4_t x2 = { lp->x2L, hp->x2L, lp->x2R, hp->x2R };
	float32x4_t y1 = { lp->y1L, hp->y1L, lp->y1R, hp->y1R };
	float32x4_t y2 = { lp->y2L, hp->y2L, lp->y2R, hp->y2R };
	float32x4_t z1 = { lp->z1L, hp->z1L, lp->z1R, hp->z1R };
	float32x4_t z2 = { lp->z2L, hp->z2L, lp->z2R, hp->z2R };
	/* Coefficients, replicated for the left and right lanes. */
	float32x4_t b0 = { lp->b0, hp->b0, lp->b0, hp->b0 };
	float32x4_t b1 = { lp->b1, hp->b1, lp->b1, hp->b1 };
	float32x4_t b2 = { lp->b2, hp->b2, lp->b2, hp->b2 };
	float32x4_t a1 = { lp->a1, hp->a1, lp->a1, hp->a1 };
	float32x4_t a2 = { lp->a2, hp->a2, lp->a2, hp->a2 };

	/* The loop below decrements count and only then tests for zero, so it
	 * always executes at least once. A zero or negative count would keep
	 * it running (and storing past the buffers) until the counter wraps.
	 */
	if (count <= 0)
		return;

	// clang-format off
	__asm__ __volatile__(
		/* q0 = x, q1 = y, q2 = z */
		"1:                                     \n"
		"vmul.f32 q1, %q[b1], %q[x1]            \n"
		"vld1.32 d0[], [%[dataL]]               \n"
		"vld1.32 d1[], [%[dataR]]               \n"
		"subs %[count], #1                      \n"
		"vmul.f32 q2, %q[b1], %q[y1]            \n"
		"vmla.f32 q1, %q[b0], q0                \n"
		"vmla.f32 q1, %q[b2], %q[x2]            \n"
		"vmov.f32 %q[x2], %q[x1]                \n"
		"vmov.f32 %q[x1], q0                    \n"
		"vmls.f32 q1, %q[a1], %q[y1]            \n"
		"vmls.f32 q1, %q[a2], %q[y2]            \n"
		"vmla.f32 q2, %q[b0], q1                \n"
		"vmla.f32 q2, %q[b2], %q[y2]            \n"
		"vmov.f32 %q[y2], %q[y1]                \n"
		"vmov.f32 %q[y1], q1                    \n"
		"vmls.f32 q2, %q[a1], %q[z1]            \n"
		"vmls.f32 q2, %q[a2], %q[z2]            \n"
		"vmov.f32 %q[z2], %q[z1]                \n"
		"vmov.f32 %q[z1], q2                    \n"
		"vpadd.f32 d4, d4, d5                   \n"
		"vst1.f32 d4[0], [%[dataL]]!            \n"
		"vst1.f32 d4[1], [%[dataR]]!            \n"
		"bne 1b                                 \n"
		: /* output */
		  "=r"(dataL),
		  "=r"(dataR),
		  "=r"(count),
		  [x1]"+w"(x1),
		  [x2]"+w"(x2),
		  [y1]"+w"(y1),
		  [y2]"+w"(y2),
		  [z1]"+w"(z1),
		  [z2]"+w"(z2)
		: /* input */
		  [dataL]"0"(dataL),
		  [dataR]"1"(dataR),
		  [count]"2"(count),
		  [b0]"w"(b0),
		  [b1]"w"(b1),
		  [b2]"w"(b2),
		  [a1]"w"(a1),
		  [a2]"w"(a2)
		: /* clobber */
		  "q0", "q1", "q2", "memory", "cc");
	// clang-format on

	/* Unpack the lp state (lanes 0 and 2). */
	lp->x1L = x1[0];
	lp->x1R = x1[2];
	lp->x2L = x2[0];
	lp->x2R = x2[2];
	lp->y1L = y1[0];
	lp->y1R = y1[2];
	lp->y2L = y2[0];
	lp->y2R = y2[2];
	lp->z1L = z1[0];
	lp->z1R = z1[2];
	lp->z2L = z2[0];
	lp->z2R = z2[2];

	/* Unpack the hp state (lanes 1 and 3). */
	hp->x1L = x1[1];
	hp->x1R = x1[3];
	hp->x2L = x2[1];
	hp->x2R = x2[3];
	hp->y1L = y1[1];
	hp->y1R = y1[3];
	hp->y2L = y2[1];
	hp->y2R = y2[3];
	hp->z1L = z1[1];
	hp->z1R = z1[3];
	hp->z2L = z2[1];
	hp->z2R = z2[3];
}
453 #elif defined(__SSE3__) && defined(__x86_64__)
454 #include <emmintrin.h>
lr42_merge(struct lr42 * lp,struct lr42 * hp,int count,float * dataL,float * dataR)455 static void lr42_merge(struct lr42 *lp, struct lr42 *hp, int count,
456 		       float *dataL, float *dataR)
457 {
458 	__m128 x1 = { lp->x1L, hp->x1L, lp->x1R, hp->x1R };
459 	__m128 x2 = { lp->x2L, hp->x2L, lp->x2R, hp->x2R };
460 	__m128 y1 = { lp->y1L, hp->y1L, lp->y1R, hp->y1R };
461 	__m128 y2 = { lp->y2L, hp->y2L, lp->y2R, hp->y2R };
462 	__m128 z1 = { lp->z1L, hp->z1L, lp->z1R, hp->z1R };
463 	__m128 z2 = { lp->z2L, hp->z2L, lp->z2R, hp->z2R };
464 	__m128 b0 = { lp->b0, hp->b0, lp->b0, hp->b0 };
465 	__m128 b1 = { lp->b1, hp->b1, lp->b1, hp->b1 };
466 	__m128 b2 = { lp->b2, hp->b2, lp->b2, hp->b2 };
467 	__m128 a1 = { lp->a1, hp->a1, lp->a1, hp->a1 };
468 	__m128 a2 = { lp->a2, hp->a2, lp->a2, hp->a2 };
469 
470 	// clang-format off
471 	__asm__ __volatile__(
472 		"1:                                     \n"
473 		"movss (%[dataL]), %%xmm2               \n"
474 		"movss (%[dataR]), %%xmm1               \n"
475 		"shufps $0, %%xmm1, %%xmm2              \n"
476 		"mulps %[b2],%[x2]                      \n"
477 		"movaps %[b0], %%xmm0                   \n"
478 		"mulps %[a2],%[z2]                      \n"
479 		"movaps %[b1], %%xmm1                   \n"
480 		"mulps %%xmm2,%%xmm0                    \n"
481 		"mulps %[x1],%%xmm1                     \n"
482 		"addps %%xmm1,%%xmm0                    \n"
483 		"movaps %[a1],%%xmm1                    \n"
484 		"mulps %[y1],%%xmm1                     \n"
485 		"addps %[x2],%%xmm0                     \n"
486 		"movaps %[b1],%[x2]                     \n"
487 		"mulps %[y1],%[x2]                      \n"
488 		"subps %%xmm1,%%xmm0                    \n"
489 		"movaps %[a2],%%xmm1                    \n"
490 		"mulps %[y2],%%xmm1                     \n"
491 		"mulps %[b2],%[y2]                      \n"
492 		"subps %%xmm1,%%xmm0                    \n"
493 		"movaps %[b0],%%xmm1                    \n"
494 		"mulps %%xmm0,%%xmm1                    \n"
495 		"addps %[x2],%%xmm1                     \n"
496 		"movaps %[x1],%[x2]                     \n"
497 		"movaps %%xmm2,%[x1]                    \n"
498 		"addps %[y2],%%xmm1                     \n"
499 		"movaps %[a1],%[y2]                     \n"
500 		"mulps %[z1],%[y2]                      \n"
501 		"subps %[y2],%%xmm1                     \n"
502 		"movaps %[y1],%[y2]                     \n"
503 		"movaps %%xmm0,%[y1]                    \n"
504 		"subps %[z2],%%xmm1                     \n"
505 		"movaps %[z1],%[z2]                     \n"
506 		"movaps %%xmm1,%[z1]                    \n"
507 		"haddps %%xmm1, %%xmm1                  \n"
508 		"movss %%xmm1, (%[dataL])               \n"
509 		"shufps $0x39, %%xmm1, %%xmm1           \n"
510 		"movss %%xmm1, (%[dataR])               \n"
511 		"add $4, %[dataL]                       \n"
512 		"add $4, %[dataR]                       \n"
513 		"sub $1, %[count]                       \n"
514 		"jnz 1b                                 \n"
515 		: /* output */
516 		  [dataL]"+r"(dataL),
517 		  [dataR]"+r"(dataR),
518 		  [count]"+r"(count),
519 		  [x1]"+x"(x1),
520 		  [x2]"+x"(x2),
521 		  [y1]"+x"(y1),
522 		  [y2]"+x"(y2),
523 		  [z1]"+x"(z1),
524 		  [z2]"+x"(z2)
525 		: /* input */
526 		  [b0]"x"(b0),
527 		  [b1]"x"(b1),
528 		  [b2]"x"(b2),
529 		  [a1]"x"(a1),
530 		  [a2]"x"(a2)
531 		: /* clobber */
532 		  "xmm0", "xmm1", "xmm2", "memory", "cc");
533 	// clang-format on
534 
535 	lp->x1L = x1[0];
536 	lp->x1R = x1[2];
537 	lp->x2L = x2[0];
538 	lp->x2R = x2[2];
539 	lp->y1L = y1[0];
540 	lp->y1R = y1[2];
541 	lp->y2L = y2[0];
542 	lp->y2R = y2[2];
543 	lp->z1L = z1[0];
544 	lp->z1R = z1[2];
545 	lp->z2L = z2[0];
546 	lp->z2R = z2[2];
547 
548 	hp->x1L = x1[1];
549 	hp->x1R = x1[3];
550 	hp->x2L = x2[1];
551 	hp->x2R = x2[3];
552 	hp->y1L = y1[1];
553 	hp->y1R = y1[3];
554 	hp->y2L = y2[1];
555 	hp->y2R = y2[3];
556 	hp->z1L = z1[1];
557 	hp->z1R = z1[3];
558 	hp->z2L = z2[1];
559 	hp->z2R = z2[3];
560 }
561 #else
lr42_merge(struct lr42 * lp,struct lr42 * hp,int count,float * dataL,float * dataR)562 static void lr42_merge(struct lr42 *lp, struct lr42 *hp, int count,
563 		       float *dataL, float *dataR)
564 {
565 	float lx1L = lp->x1L, lx1R = lp->x1R;
566 	float lx2L = lp->x2L, lx2R = lp->x2R;
567 	float ly1L = lp->y1L, ly1R = lp->y1R;
568 	float ly2L = lp->y2L, ly2R = lp->y2R;
569 	float lz1L = lp->z1L, lz1R = lp->z1R;
570 	float lz2L = lp->z2L, lz2R = lp->z2R;
571 	float lb0 = lp->b0;
572 	float lb1 = lp->b1;
573 	float lb2 = lp->b2;
574 	float la1 = lp->a1;
575 	float la2 = lp->a2;
576 
577 	float hx1L = hp->x1L, hx1R = hp->x1R;
578 	float hx2L = hp->x2L, hx2R = hp->x2R;
579 	float hy1L = hp->y1L, hy1R = hp->y1R;
580 	float hy2L = hp->y2L, hy2R = hp->y2R;
581 	float hz1L = hp->z1L, hz1R = hp->z1R;
582 	float hz2L = hp->z2L, hz2R = hp->z2R;
583 	float hb0 = hp->b0;
584 	float hb1 = hp->b1;
585 	float hb2 = hp->b2;
586 	float ha1 = hp->a1;
587 	float ha2 = hp->a2;
588 
589 	int i;
590 	for (i = 0; i < count; i++) {
591 		float xL, yL, zL, xR, yR, zR;
592 		xL = dataL[i];
593 		xR = dataR[i];
594 		yL = lb0 * xL + lb1 * lx1L + lb2 * lx2L - la1 * ly1L -
595 		     la2 * ly2L;
596 		yR = lb0 * xR + lb1 * lx1R + lb2 * lx2R - la1 * ly1R -
597 		     la2 * ly2R;
598 		zL = lb0 * yL + lb1 * ly1L + lb2 * ly2L - la1 * lz1L -
599 		     la2 * lz2L;
600 		zR = lb0 * yR + lb1 * ly1R + lb2 * ly2R - la1 * lz1R -
601 		     la2 * lz2R;
602 		lx2L = lx1L;
603 		lx2R = lx1R;
604 		lx1L = xL;
605 		lx1R = xR;
606 		ly2L = ly1L;
607 		ly2R = ly1R;
608 		ly1L = yL;
609 		ly1R = yR;
610 		lz2L = lz1L;
611 		lz2R = lz1R;
612 		lz1L = zL;
613 		lz1R = zR;
614 
615 		yL = hb0 * xL + hb1 * hx1L + hb2 * hx2L - ha1 * hy1L -
616 		     ha2 * hy2L;
617 		yR = hb0 * xR + hb1 * hx1R + hb2 * hx2R - ha1 * hy1R -
618 		     ha2 * hy2R;
619 		zL = hb0 * yL + hb1 * hy1L + hb2 * hy2L - ha1 * hz1L -
620 		     ha2 * hz2L;
621 		zR = hb0 * yR + hb1 * hy1R + hb2 * hy2R - ha1 * hz1R -
622 		     ha2 * hz2R;
623 		hx2L = hx1L;
624 		hx2R = hx1R;
625 		hx1L = xL;
626 		hx1R = xR;
627 		hy2L = hy1L;
628 		hy2R = hy1R;
629 		hy1L = yL;
630 		hy1R = yR;
631 		hz2L = hz1L;
632 		hz2R = hz1R;
633 		hz1L = zL;
634 		hz1R = zR;
635 		dataL[i] = zL + lz1L;
636 		dataR[i] = zR + lz1R;
637 	}
638 
639 	lp->x1L = lx1L;
640 	lp->x1R = lx1R;
641 	lp->x2L = lx2L;
642 	lp->x2R = lx2R;
643 	lp->y1L = ly1L;
644 	lp->y1R = ly1R;
645 	lp->y2L = ly2L;
646 	lp->y2R = ly2R;
647 	lp->z1L = lz1L;
648 	lp->z1R = lz1R;
649 	lp->z2L = lz2L;
650 	lp->z2R = lz2R;
651 
652 	hp->x1L = hx1L;
653 	hp->x1R = hx1R;
654 	hp->x2L = hx2L;
655 	hp->x2R = hx2R;
656 	hp->y1L = hy1L;
657 	hp->y1R = hy1R;
658 	hp->y2L = hy2L;
659 	hp->y2R = hy2R;
660 	hp->z1L = hz1L;
661 	hp->z1R = hz1R;
662 	hp->z2L = hz2L;
663 	hp->z2R = hz2R;
664 }
665 #endif
666 
crossover2_init(struct crossover2 * xo2,float freq1,float freq2)667 void crossover2_init(struct crossover2 *xo2, float freq1, float freq2)
668 {
669 	int i;
670 	for (i = 0; i < 3; i++) {
671 		float f = (i == 0) ? freq1 : freq2;
672 		lr42_set(&xo2->lp[i], BQ_LOWPASS, f);
673 		lr42_set(&xo2->hp[i], BQ_HIGHPASS, f);
674 	}
675 }
676 
crossover2_process(struct crossover2 * xo2,int count,float * data0L,float * data0R,float * data1L,float * data1R,float * data2L,float * data2R)677 void crossover2_process(struct crossover2 *xo2, int count, float *data0L,
678 			float *data0R, float *data1L, float *data1R,
679 			float *data2L, float *data2R)
680 {
681 	if (!count)
682 		return;
683 
684 	lr42_split(&xo2->lp[0], &xo2->hp[0], count, data0L, data0R, data1L,
685 		   data1R);
686 	lr42_merge(&xo2->lp[1], &xo2->hp[1], count, data0L, data0R);
687 	lr42_split(&xo2->lp[2], &xo2->hp[2], count, data1L, data1R, data2L,
688 		   data2R);
689 }
690