1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE file.
4  */
5 
6 #include <string.h>
7 #include "crossover2.h"
8 #include "biquad.h"
9 
lr42_set(struct lr42 * lr42,enum biquad_type type,float freq)10 static void lr42_set(struct lr42 *lr42, enum biquad_type type, float freq)
11 {
12 	struct biquad q;
13 	biquad_set(&q, type, freq, 0, 0);
14 	memset(lr42, 0, sizeof(*lr42));
15 	lr42->b0 = q.b0;
16 	lr42->b1 = q.b1;
17 	lr42->b2 = q.b2;
18 	lr42->a1 = q.a1;
19 	lr42->a2 = q.a2;
20 }
21 
22 /* Split input data using two LR4 filters, put the result into the input array
23  * and another array.
24  *
25  * data0 --+-- lp --> data0
26  *         |
27  *         \-- hp --> data1
28  */
29 #if defined(__ARM_NEON__)
30 #include <arm_neon.h>
lr42_split(struct lr42 * lp,struct lr42 * hp,int count,float * data0L,float * data0R,float * data1L,float * data1R)31 static void lr42_split(struct lr42 *lp, struct lr42 *hp, int count,
32 		       float *data0L, float *data0R,
33 		       float *data1L, float *data1R)
34 {
35 	float32x4_t x1 = {lp->x1L, hp->x1L, lp->x1R, hp->x1R};
36 	float32x4_t x2 = {lp->x2L, hp->x2L, lp->x2R, hp->x2R};
37 	float32x4_t y1 = {lp->y1L, hp->y1L, lp->y1R, hp->y1R};
38 	float32x4_t y2 = {lp->y2L, hp->y2L, lp->y2R, hp->y2R};
39 	float32x4_t z1 = {lp->z1L, hp->z1L, lp->z1R, hp->z1R};
40 	float32x4_t z2 = {lp->z2L, hp->z2L, lp->z2R, hp->z2R};
41 	float32x4_t b0 = {lp->b0, hp->b0, lp->b0, hp->b0};
42 	float32x4_t b1 = {lp->b1, hp->b1, lp->b1, hp->b1};
43 	float32x4_t b2 = {lp->b2, hp->b2, lp->b2, hp->b2};
44 	float32x4_t a1 = {lp->a1, hp->a1, lp->a1, hp->a1};
45 	float32x4_t a2 = {lp->a2, hp->a2, lp->a2, hp->a2};
46 
47 	__asm__ __volatile__(
48 		/* q0 = x, q1 = y, q2 = z */
49 		"1:                                     \n"
50 		"vmul.f32 q1, %q[b1], %q[x1]            \n"
51 		"vld1.32 d0[], [%[data0L]]              \n"
52 		"vld1.32 d1[], [%[data0R]]              \n"
53 		"subs %[count], #1                      \n"
54 		"vmul.f32 q2, %q[b1], %q[y1]            \n"
55 		"vmla.f32 q1, %q[b0], q0                \n"
56 		"vmla.f32 q1, %q[b2], %q[x2]            \n"
57 		"vmov.f32 %q[x2], %q[x1]                \n"
58 		"vmov.f32 %q[x1], q0                    \n"
59 		"vmls.f32 q1, %q[a1], %q[y1]            \n"
60 		"vmls.f32 q1, %q[a2], %q[y2]            \n"
61 		"vmla.f32 q2, %q[b0], q1                \n"
62 		"vmla.f32 q2, %q[b2], %q[y2]            \n"
63 		"vmov.f32 %q[y2], %q[y1]                \n"
64 		"vmov.f32 %q[y1], q1                    \n"
65 		"vmls.f32 q2, %q[a1], %q[z1]            \n"
66 		"vmls.f32 q2, %q[a2], %q[z2]            \n"
67 		"vmov.f32 %q[z2], %q[z1]                \n"
68 		"vmov.f32 %q[z1], q2                    \n"
69 		"vst1.f32 d4[0], [%[data0L]]!           \n"
70 		"vst1.f32 d4[1], [%[data1L]]!           \n"
71 		"vst1.f32 d5[0], [%[data0R]]!           \n"
72 		"vst1.f32 d5[1], [%[data1R]]!           \n"
73 		"bne 1b                                 \n"
74 		: /* output */
75 		  "=r"(data0L),
76 		  "=r"(data0R),
77 		  "=r"(data1L),
78 		  "=r"(data1R),
79 		  "=r"(count),
80 		  [x1]"+w"(x1),
81 		  [x2]"+w"(x2),
82 		  [y1]"+w"(y1),
83 		  [y2]"+w"(y2),
84 		  [z1]"+w"(z1),
85 		  [z2]"+w"(z2)
86 		: /* input */
87 		  [data0L]"0"(data0L),
88 		  [data0R]"1"(data0R),
89 		  [data1L]"2"(data1L),
90 		  [data1R]"3"(data1R),
91 		  [count]"4"(count),
92 		  [b0]"w"(b0),
93 		  [b1]"w"(b1),
94 		  [b2]"w"(b2),
95 		  [a1]"w"(a1),
96 		  [a2]"w"(a2)
97 		: /* clobber */
98 		  "q0", "q1", "q2", "memory", "cc"
99 		);
100 
101 	lp->x1L = x1[0]; lp->x1R = x1[2];
102 	lp->x2L = x2[0]; lp->x2R = x2[2];
103 	lp->y1L = y1[0]; lp->y1R = y1[2];
104 	lp->y2L = y2[0]; lp->y2R = y2[2];
105 	lp->z1L = z1[0]; lp->z1R = z1[2];
106 	lp->z2L = z2[0]; lp->z2R = z2[2];
107 
108 	hp->x1L = x1[1]; hp->x1R = x1[3];
109 	hp->x2L = x2[1]; hp->x2R = x2[3];
110 	hp->y1L = y1[1]; hp->y1R = y1[3];
111 	hp->y2L = y2[1]; hp->y2R = y2[3];
112 	hp->z1L = z1[1]; hp->z1R = z1[3];
113 	hp->z2L = z2[1]; hp->z2R = z2[3];
114 }
115 #elif defined(__SSE3__) && defined(__x86_64__)
116 #include <emmintrin.h>
lr42_split(struct lr42 * lp,struct lr42 * hp,int count,float * data0L,float * data0R,float * data1L,float * data1R)117 static void lr42_split(struct lr42 *lp, struct lr42 *hp, int count,
118 		       float *data0L, float *data0R,
119 		       float *data1L, float *data1R)
120 {
121 	__m128 x1 = {lp->x1L, hp->x1L, lp->x1R, hp->x1R};
122 	__m128 x2 = {lp->x2L, hp->x2L, lp->x2R, hp->x2R};
123 	__m128 y1 = {lp->y1L, hp->y1L, lp->y1R, hp->y1R};
124 	__m128 y2 = {lp->y2L, hp->y2L, lp->y2R, hp->y2R};
125 	__m128 z1 = {lp->z1L, hp->z1L, lp->z1R, hp->z1R};
126 	__m128 z2 = {lp->z2L, hp->z2L, lp->z2R, hp->z2R};
127 	__m128 b0 = {lp->b0, hp->b0, lp->b0, hp->b0};
128 	__m128 b1 = {lp->b1, hp->b1, lp->b1, hp->b1};
129 	__m128 b2 = {lp->b2, hp->b2, lp->b2, hp->b2};
130 	__m128 a1 = {lp->a1, hp->a1, lp->a1, hp->a1};
131 	__m128 a2 = {lp->a2, hp->a2, lp->a2, hp->a2};
132 
133 	__asm__ __volatile__(
134 		"1:                                     \n"
135 		"movss (%[data0L]), %%xmm2              \n"
136 		"movss (%[data0R]), %%xmm1              \n"
137 		"shufps $0, %%xmm1, %%xmm2              \n"
138 		"mulps %[b2],%[x2]                      \n"
139 		"movaps %[b0], %%xmm0                   \n"
140 		"mulps %[a2],%[z2]                      \n"
141 		"movaps %[b1], %%xmm1                   \n"
142 		"mulps %%xmm2,%%xmm0                    \n"
143 		"mulps %[x1],%%xmm1                     \n"
144 		"addps %%xmm1,%%xmm0                    \n"
145 		"movaps %[a1],%%xmm1                    \n"
146 		"mulps %[y1],%%xmm1                     \n"
147 		"addps %[x2],%%xmm0                     \n"
148 		"movaps %[b1],%[x2]                     \n"
149 		"mulps %[y1],%[x2]                      \n"
150 		"subps %%xmm1,%%xmm0                    \n"
151 		"movaps %[a2],%%xmm1                    \n"
152 		"mulps %[y2],%%xmm1                     \n"
153 		"mulps %[b2],%[y2]                      \n"
154 		"subps %%xmm1,%%xmm0                    \n"
155 		"movaps %[b0],%%xmm1                    \n"
156 		"mulps %%xmm0,%%xmm1                    \n"
157 		"addps %[x2],%%xmm1                     \n"
158 		"movaps %[x1],%[x2]                     \n"
159 		"movaps %%xmm2,%[x1]                    \n"
160 		"addps %[y2],%%xmm1                     \n"
161 		"movaps %[a1],%[y2]                     \n"
162 		"mulps %[z1],%[y2]                      \n"
163 		"subps %[y2],%%xmm1                     \n"
164 		"movaps %[y1],%[y2]                     \n"
165 		"movaps %%xmm0,%[y1]                    \n"
166 		"subps %[z2],%%xmm1                     \n"
167 		"movaps %[z1],%[z2]                     \n"
168 		"movaps %%xmm1,%[z1]                    \n"
169 		"movss %%xmm1, (%[data0L])              \n"
170 		"shufps $0x39, %%xmm1, %%xmm1           \n"
171 		"movss %%xmm1, (%[data1L])              \n"
172 		"shufps $0x39, %%xmm1, %%xmm1           \n"
173 		"movss %%xmm1, (%[data0R])              \n"
174 		"shufps $0x39, %%xmm1, %%xmm1           \n"
175 		"movss %%xmm1, (%[data1R])              \n"
176 		"add $4, %[data0L]                      \n"
177 		"add $4, %[data1L]                      \n"
178 		"add $4, %[data0R]                      \n"
179 		"add $4, %[data1R]                      \n"
180 		"sub $1, %[count]                       \n"
181 		"jnz 1b                                 \n"
182 		: /* output */
183 		  [data0L]"+r"(data0L),
184 		  [data0R]"+r"(data0R),
185 		  [data1L]"+r"(data1L),
186 		  [data1R]"+r"(data1R),
187 		  [count]"+r"(count),
188 		  [x1]"+x"(x1),
189 		  [x2]"+x"(x2),
190 		  [y1]"+x"(y1),
191 		  [y2]"+x"(y2),
192 		  [z1]"+x"(z1),
193 		  [z2]"+x"(z2)
194 		: /* input */
195 		  [b0]"x"(b0),
196 		  [b1]"x"(b1),
197 		  [b2]"x"(b2),
198 		  [a1]"x"(a1),
199 		  [a2]"x"(a2)
200 		: /* clobber */
201 		  "xmm0", "xmm1", "xmm2", "memory", "cc"
202 		);
203 
204 	lp->x1L = x1[0]; lp->x1R = x1[2];
205 	lp->x2L = x2[0]; lp->x2R = x2[2];
206 	lp->y1L = y1[0]; lp->y1R = y1[2];
207 	lp->y2L = y2[0]; lp->y2R = y2[2];
208 	lp->z1L = z1[0]; lp->z1R = z1[2];
209 	lp->z2L = z2[0]; lp->z2R = z2[2];
210 
211 	hp->x1L = x1[1]; hp->x1R = x1[3];
212 	hp->x2L = x2[1]; hp->x2R = x2[3];
213 	hp->y1L = y1[1]; hp->y1R = y1[3];
214 	hp->y2L = y2[1]; hp->y2R = y2[3];
215 	hp->z1L = z1[1]; hp->z1R = z1[3];
216 	hp->z2L = z2[1]; hp->z2R = z2[3];
217 }
218 #else
lr42_split(struct lr42 * lp,struct lr42 * hp,int count,float * data0L,float * data0R,float * data1L,float * data1R)219 static void lr42_split(struct lr42 *lp, struct lr42 *hp, int count,
220 		       float *data0L, float *data0R,
221 		       float *data1L, float *data1R)
222 {
223 	float lx1L = lp->x1L, lx1R = lp->x1R;
224 	float lx2L = lp->x2L, lx2R = lp->x2R;
225 	float ly1L = lp->y1L, ly1R = lp->y1R;
226 	float ly2L = lp->y2L, ly2R = lp->y2R;
227 	float lz1L = lp->z1L, lz1R = lp->z1R;
228 	float lz2L = lp->z2L, lz2R = lp->z2R;
229 	float lb0 = lp->b0;
230 	float lb1 = lp->b1;
231 	float lb2 = lp->b2;
232 	float la1 = lp->a1;
233 	float la2 = lp->a2;
234 
235 	float hx1L = hp->x1L, hx1R = hp->x1R;
236 	float hx2L = hp->x2L, hx2R = hp->x2R;
237 	float hy1L = hp->y1L, hy1R = hp->y1R;
238 	float hy2L = hp->y2L, hy2R = hp->y2R;
239 	float hz1L = hp->z1L, hz1R = hp->z1R;
240 	float hz2L = hp->z2L, hz2R = hp->z2R;
241 	float hb0 = hp->b0;
242 	float hb1 = hp->b1;
243 	float hb2 = hp->b2;
244 	float ha1 = hp->a1;
245 	float ha2 = hp->a2;
246 
247 	int i;
248 	for (i = 0; i < count; i++) {
249 		float xL, yL, zL, xR, yR, zR;
250 		xL = data0L[i];
251 		xR = data0R[i];
252 		yL = lb0*xL + lb1*lx1L + lb2*lx2L - la1*ly1L - la2*ly2L;
253 		yR = lb0*xR + lb1*lx1R + lb2*lx2R - la1*ly1R - la2*ly2R;
254 		zL = lb0*yL + lb1*ly1L + lb2*ly2L - la1*lz1L - la2*lz2L;
255 		zR = lb0*yR + lb1*ly1R + lb2*ly2R - la1*lz1R - la2*lz2R;
256 		lx2L = lx1L;
257 		lx2R = lx1R;
258 		lx1L = xL;
259 		lx1R = xR;
260 		ly2L = ly1L;
261 		ly2R = ly1R;
262 		ly1L = yL;
263 		ly1R = yR;
264 		lz2L = lz1L;
265 		lz2R = lz1R;
266 		lz1L = zL;
267 		lz1R = zR;
268 		data0L[i] = zL;
269 		data0R[i] = zR;
270 
271 		yL = hb0*xL + hb1*hx1L + hb2*hx2L - ha1*hy1L - ha2*hy2L;
272 		yR = hb0*xR + hb1*hx1R + hb2*hx2R - ha1*hy1R - ha2*hy2R;
273 		zL = hb0*yL + hb1*hy1L + hb2*hy2L - ha1*hz1L - ha2*hz2L;
274 		zR = hb0*yR + hb1*hy1R + hb2*hy2R - ha1*hz1R - ha2*hz2R;
275 		hx2L = hx1L;
276 		hx2R = hx1R;
277 		hx1L = xL;
278 		hx1R = xR;
279 		hy2L = hy1L;
280 		hy2R = hy1R;
281 		hy1L = yL;
282 		hy1R = yR;
283 		hz2L = hz1L;
284 		hz2R = hz1R;
285 		hz1L = zL;
286 		hz1R = zR;
287 		data1L[i] = zL;
288 		data1R[i] = zR;
289 	}
290 
291 	lp->x1L = lx1L; lp->x1R = lx1R;
292 	lp->x2L = lx2L;	lp->x2R = lx2R;
293 	lp->y1L = ly1L;	lp->y1R = ly1R;
294 	lp->y2L = ly2L;	lp->y2R = ly2R;
295 	lp->z1L = lz1L;	lp->z1R = lz1R;
296 	lp->z2L = lz2L;	lp->z2R = lz2R;
297 
298 	hp->x1L = hx1L; hp->x1R = hx1R;
299 	hp->x2L = hx2L;	hp->x2R = hx2R;
300 	hp->y1L = hy1L;	hp->y1R = hy1R;
301 	hp->y2L = hy2L;	hp->y2R = hy2R;
302 	hp->z1L = hz1L;	hp->z1R = hz1R;
303 	hp->z2L = hz2L;	hp->z2R = hz2R;
304 }
305 #endif
306 
307 /* Split input data using two LR4 filters and sum them back to the original
308  * data array.
309  *
310  * data --+-- lp --+--> data
311  *        |        |
312  *        \-- hp --/
313  */
314 #if defined(__ARM_NEON__)
315 #include <arm_neon.h>
lr42_merge(struct lr42 * lp,struct lr42 * hp,int count,float * dataL,float * dataR)316 static void lr42_merge(struct lr42 *lp, struct lr42 *hp, int count,
317 		       float *dataL, float *dataR)
318 {
319 	float32x4_t x1 = {lp->x1L, hp->x1L, lp->x1R, hp->x1R};
320 	float32x4_t x2 = {lp->x2L, hp->x2L, lp->x2R, hp->x2R};
321 	float32x4_t y1 = {lp->y1L, hp->y1L, lp->y1R, hp->y1R};
322 	float32x4_t y2 = {lp->y2L, hp->y2L, lp->y2R, hp->y2R};
323 	float32x4_t z1 = {lp->z1L, hp->z1L, lp->z1R, hp->z1R};
324 	float32x4_t z2 = {lp->z2L, hp->z2L, lp->z2R, hp->z2R};
325 	float32x4_t b0 = {lp->b0, hp->b0, lp->b0, hp->b0};
326 	float32x4_t b1 = {lp->b1, hp->b1, lp->b1, hp->b1};
327 	float32x4_t b2 = {lp->b2, hp->b2, lp->b2, hp->b2};
328 	float32x4_t a1 = {lp->a1, hp->a1, lp->a1, hp->a1};
329 	float32x4_t a2 = {lp->a2, hp->a2, lp->a2, hp->a2};
330 
331 	__asm__ __volatile__(
332 		/* q0 = x, q1 = y, q2 = z */
333 		"1:                                     \n"
334 		"vmul.f32 q1, %q[b1], %q[x1]            \n"
335 		"vld1.32 d0[], [%[dataL]]               \n"
336 		"vld1.32 d1[], [%[dataR]]               \n"
337 		"subs %[count], #1                      \n"
338 		"vmul.f32 q2, %q[b1], %q[y1]            \n"
339 		"vmla.f32 q1, %q[b0], q0                \n"
340 		"vmla.f32 q1, %q[b2], %q[x2]            \n"
341 		"vmov.f32 %q[x2], %q[x1]                \n"
342 		"vmov.f32 %q[x1], q0                    \n"
343 		"vmls.f32 q1, %q[a1], %q[y1]            \n"
344 		"vmls.f32 q1, %q[a2], %q[y2]            \n"
345 		"vmla.f32 q2, %q[b0], q1                \n"
346 		"vmla.f32 q2, %q[b2], %q[y2]            \n"
347 		"vmov.f32 %q[y2], %q[y1]                \n"
348 		"vmov.f32 %q[y1], q1                    \n"
349 		"vmls.f32 q2, %q[a1], %q[z1]            \n"
350 		"vmls.f32 q2, %q[a2], %q[z2]            \n"
351 		"vmov.f32 %q[z2], %q[z1]                \n"
352 		"vmov.f32 %q[z1], q2                    \n"
353 		"vpadd.f32 d4, d4, d5                   \n"
354 		"vst1.f32 d4[0], [%[dataL]]!            \n"
355 		"vst1.f32 d4[1], [%[dataR]]!            \n"
356 		"bne 1b                                 \n"
357 		: /* output */
358 		  "=r"(dataL),
359 		  "=r"(dataR),
360 		  "=r"(count),
361 		  [x1]"+w"(x1),
362 		  [x2]"+w"(x2),
363 		  [y1]"+w"(y1),
364 		  [y2]"+w"(y2),
365 		  [z1]"+w"(z1),
366 		  [z2]"+w"(z2)
367 		: /* input */
368 		  [dataL]"0"(dataL),
369 		  [dataR]"1"(dataR),
370 		  [count]"2"(count),
371 		  [b0]"w"(b0),
372 		  [b1]"w"(b1),
373 		  [b2]"w"(b2),
374 		  [a1]"w"(a1),
375 		  [a2]"w"(a2)
376 		: /* clobber */
377 		  "q0", "q1", "q2", "memory", "cc"
378 		);
379 
380 	lp->x1L = x1[0]; lp->x1R = x1[2];
381 	lp->x2L = x2[0]; lp->x2R = x2[2];
382 	lp->y1L = y1[0]; lp->y1R = y1[2];
383 	lp->y2L = y2[0]; lp->y2R = y2[2];
384 	lp->z1L = z1[0]; lp->z1R = z1[2];
385 	lp->z2L = z2[0]; lp->z2R = z2[2];
386 
387 	hp->x1L = x1[1]; hp->x1R = x1[3];
388 	hp->x2L = x2[1]; hp->x2R = x2[3];
389 	hp->y1L = y1[1]; hp->y1R = y1[3];
390 	hp->y2L = y2[1]; hp->y2R = y2[3];
391 	hp->z1L = z1[1]; hp->z1R = z1[3];
392 	hp->z2L = z2[1]; hp->z2R = z2[3];
393 }
394 #elif defined(__SSE3__) && defined(__x86_64__)
395 #include <emmintrin.h>
lr42_merge(struct lr42 * lp,struct lr42 * hp,int count,float * dataL,float * dataR)396 static void lr42_merge(struct lr42 *lp, struct lr42 *hp, int count,
397 		       float *dataL, float *dataR)
398 {
399 	__m128 x1 = {lp->x1L, hp->x1L, lp->x1R, hp->x1R};
400 	__m128 x2 = {lp->x2L, hp->x2L, lp->x2R, hp->x2R};
401 	__m128 y1 = {lp->y1L, hp->y1L, lp->y1R, hp->y1R};
402 	__m128 y2 = {lp->y2L, hp->y2L, lp->y2R, hp->y2R};
403 	__m128 z1 = {lp->z1L, hp->z1L, lp->z1R, hp->z1R};
404 	__m128 z2 = {lp->z2L, hp->z2L, lp->z2R, hp->z2R};
405 	__m128 b0 = {lp->b0, hp->b0, lp->b0, hp->b0};
406 	__m128 b1 = {lp->b1, hp->b1, lp->b1, hp->b1};
407 	__m128 b2 = {lp->b2, hp->b2, lp->b2, hp->b2};
408 	__m128 a1 = {lp->a1, hp->a1, lp->a1, hp->a1};
409 	__m128 a2 = {lp->a2, hp->a2, lp->a2, hp->a2};
410 
411 	__asm__ __volatile__(
412 		"1:                                     \n"
413 		"movss (%[dataL]), %%xmm2               \n"
414 		"movss (%[dataR]), %%xmm1               \n"
415 		"shufps $0, %%xmm1, %%xmm2              \n"
416 		"mulps %[b2],%[x2]                      \n"
417 		"movaps %[b0], %%xmm0                   \n"
418 		"mulps %[a2],%[z2]                      \n"
419 		"movaps %[b1], %%xmm1                   \n"
420 		"mulps %%xmm2,%%xmm0                    \n"
421 		"mulps %[x1],%%xmm1                     \n"
422 		"addps %%xmm1,%%xmm0                    \n"
423 		"movaps %[a1],%%xmm1                    \n"
424 		"mulps %[y1],%%xmm1                     \n"
425 		"addps %[x2],%%xmm0                     \n"
426 		"movaps %[b1],%[x2]                     \n"
427 		"mulps %[y1],%[x2]                      \n"
428 		"subps %%xmm1,%%xmm0                    \n"
429 		"movaps %[a2],%%xmm1                    \n"
430 		"mulps %[y2],%%xmm1                     \n"
431 		"mulps %[b2],%[y2]                      \n"
432 		"subps %%xmm1,%%xmm0                    \n"
433 		"movaps %[b0],%%xmm1                    \n"
434 		"mulps %%xmm0,%%xmm1                    \n"
435 		"addps %[x2],%%xmm1                     \n"
436 		"movaps %[x1],%[x2]                     \n"
437 		"movaps %%xmm2,%[x1]                    \n"
438 		"addps %[y2],%%xmm1                     \n"
439 		"movaps %[a1],%[y2]                     \n"
440 		"mulps %[z1],%[y2]                      \n"
441 		"subps %[y2],%%xmm1                     \n"
442 		"movaps %[y1],%[y2]                     \n"
443 		"movaps %%xmm0,%[y1]                    \n"
444 		"subps %[z2],%%xmm1                     \n"
445 		"movaps %[z1],%[z2]                     \n"
446 		"movaps %%xmm1,%[z1]                    \n"
447 		"haddps %%xmm1, %%xmm1                  \n"
448 		"movss %%xmm1, (%[dataL])               \n"
449 		"shufps $0x39, %%xmm1, %%xmm1           \n"
450 		"movss %%xmm1, (%[dataR])               \n"
451 		"add $4, %[dataL]                       \n"
452 		"add $4, %[dataR]                       \n"
453 		"sub $1, %[count]                       \n"
454 		"jnz 1b                                 \n"
455 		: /* output */
456 		  [dataL]"+r"(dataL),
457 		  [dataR]"+r"(dataR),
458 		  [count]"+r"(count),
459 		  [x1]"+x"(x1),
460 		  [x2]"+x"(x2),
461 		  [y1]"+x"(y1),
462 		  [y2]"+x"(y2),
463 		  [z1]"+x"(z1),
464 		  [z2]"+x"(z2)
465 		: /* input */
466 		  [b0]"x"(b0),
467 		  [b1]"x"(b1),
468 		  [b2]"x"(b2),
469 		  [a1]"x"(a1),
470 		  [a2]"x"(a2)
471 		: /* clobber */
472 		  "xmm0", "xmm1", "xmm2", "memory", "cc"
473 		);
474 
475 	lp->x1L = x1[0]; lp->x1R = x1[2];
476 	lp->x2L = x2[0]; lp->x2R = x2[2];
477 	lp->y1L = y1[0]; lp->y1R = y1[2];
478 	lp->y2L = y2[0]; lp->y2R = y2[2];
479 	lp->z1L = z1[0]; lp->z1R = z1[2];
480 	lp->z2L = z2[0]; lp->z2R = z2[2];
481 
482 	hp->x1L = x1[1]; hp->x1R = x1[3];
483 	hp->x2L = x2[1]; hp->x2R = x2[3];
484 	hp->y1L = y1[1]; hp->y1R = y1[3];
485 	hp->y2L = y2[1]; hp->y2R = y2[3];
486 	hp->z1L = z1[1]; hp->z1R = z1[3];
487 	hp->z2L = z2[1]; hp->z2R = z2[3];
488 }
489 #else
lr42_merge(struct lr42 * lp,struct lr42 * hp,int count,float * dataL,float * dataR)490 static void lr42_merge(struct lr42 *lp, struct lr42 *hp, int count,
491 		       float *dataL, float *dataR)
492 {
493 	float lx1L = lp->x1L, lx1R = lp->x1R;
494 	float lx2L = lp->x2L, lx2R = lp->x2R;
495 	float ly1L = lp->y1L, ly1R = lp->y1R;
496 	float ly2L = lp->y2L, ly2R = lp->y2R;
497 	float lz1L = lp->z1L, lz1R = lp->z1R;
498 	float lz2L = lp->z2L, lz2R = lp->z2R;
499 	float lb0 = lp->b0;
500 	float lb1 = lp->b1;
501 	float lb2 = lp->b2;
502 	float la1 = lp->a1;
503 	float la2 = lp->a2;
504 
505 	float hx1L = hp->x1L, hx1R = hp->x1R;
506 	float hx2L = hp->x2L, hx2R = hp->x2R;
507 	float hy1L = hp->y1L, hy1R = hp->y1R;
508 	float hy2L = hp->y2L, hy2R = hp->y2R;
509 	float hz1L = hp->z1L, hz1R = hp->z1R;
510 	float hz2L = hp->z2L, hz2R = hp->z2R;
511 	float hb0 = hp->b0;
512 	float hb1 = hp->b1;
513 	float hb2 = hp->b2;
514 	float ha1 = hp->a1;
515 	float ha2 = hp->a2;
516 
517 	int i;
518 	for (i = 0; i < count; i++) {
519 		float xL, yL, zL, xR, yR, zR;
520 		xL = dataL[i];
521 		xR = dataR[i];
522 		yL = lb0*xL + lb1*lx1L + lb2*lx2L - la1*ly1L - la2*ly2L;
523 		yR = lb0*xR + lb1*lx1R + lb2*lx2R - la1*ly1R - la2*ly2R;
524 		zL = lb0*yL + lb1*ly1L + lb2*ly2L - la1*lz1L - la2*lz2L;
525 		zR = lb0*yR + lb1*ly1R + lb2*ly2R - la1*lz1R - la2*lz2R;
526 		lx2L = lx1L;
527 		lx2R = lx1R;
528 		lx1L = xL;
529 		lx1R = xR;
530 		ly2L = ly1L;
531 		ly2R = ly1R;
532 		ly1L = yL;
533 		ly1R = yR;
534 		lz2L = lz1L;
535 		lz2R = lz1R;
536 		lz1L = zL;
537 		lz1R = zR;
538 
539 		yL = hb0*xL + hb1*hx1L + hb2*hx2L - ha1*hy1L - ha2*hy2L;
540 		yR = hb0*xR + hb1*hx1R + hb2*hx2R - ha1*hy1R - ha2*hy2R;
541 		zL = hb0*yL + hb1*hy1L + hb2*hy2L - ha1*hz1L - ha2*hz2L;
542 		zR = hb0*yR + hb1*hy1R + hb2*hy2R - ha1*hz1R - ha2*hz2R;
543 		hx2L = hx1L;
544 		hx2R = hx1R;
545 		hx1L = xL;
546 		hx1R = xR;
547 		hy2L = hy1L;
548 		hy2R = hy1R;
549 		hy1L = yL;
550 		hy1R = yR;
551 		hz2L = hz1L;
552 		hz2R = hz1R;
553 		hz1L = zL;
554 		hz1R = zR;
555 		dataL[i] = zL + lz1L;
556 		dataR[i] = zR + lz1R;
557 	}
558 
559 	lp->x1L = lx1L; lp->x1R = lx1R;
560 	lp->x2L = lx2L;	lp->x2R = lx2R;
561 	lp->y1L = ly1L;	lp->y1R = ly1R;
562 	lp->y2L = ly2L;	lp->y2R = ly2R;
563 	lp->z1L = lz1L;	lp->z1R = lz1R;
564 	lp->z2L = lz2L;	lp->z2R = lz2R;
565 
566 	hp->x1L = hx1L; hp->x1R = hx1R;
567 	hp->x2L = hx2L;	hp->x2R = hx2R;
568 	hp->y1L = hy1L;	hp->y1R = hy1R;
569 	hp->y2L = hy2L;	hp->y2R = hy2R;
570 	hp->z1L = hz1L;	hp->z1R = hz1R;
571 	hp->z2L = hz2L;	hp->z2R = hz2R;
572 }
573 #endif
574 
crossover2_init(struct crossover2 * xo2,float freq1,float freq2)575 void crossover2_init(struct crossover2 *xo2, float freq1, float freq2)
576 {
577 	int i;
578 	for (i = 0; i < 3; i++) {
579 		float f = (i == 0) ? freq1 : freq2;
580 		lr42_set(&xo2->lp[i], BQ_LOWPASS, f);
581 		lr42_set(&xo2->hp[i], BQ_HIGHPASS, f);
582 	}
583 }
584 
crossover2_process(struct crossover2 * xo2,int count,float * data0L,float * data0R,float * data1L,float * data1R,float * data2L,float * data2R)585 void crossover2_process(struct crossover2 *xo2, int count,
586 			float *data0L, float *data0R,
587 			float *data1L, float *data1R,
588 			float *data2L, float *data2R)
589 {
590 	if (!count)
591 		return;
592 
593 	lr42_split(&xo2->lp[0], &xo2->hp[0], count, data0L, data0R,
594 		   data1L, data1R);
595 	lr42_merge(&xo2->lp[1], &xo2->hp[1], count, data0L, data0R);
596 	lr42_split(&xo2->lp[2], &xo2->hp[2], count, data1L, data1R,
597 		   data2L, data2R);
598 }
599