/* Copyright (C) 2002 Jean-Marc Valin */
/**
   @file filters_sse.h
   @brief Various analysis/synthesis filters (SSE version)
*/
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   - Neither the name of the Xiph.org Foundation nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <xmmintrin.h>

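/* Direct-form II transposed implementation of the order-10 pole-zero filter,
   with the filter memory kept in xmm registers for the whole frame.  The ten
   memory taps are packed as _mem[0..3], _mem[4..7] and _mem[8..9] (the last
   register is zero-padded).  For each sample, y[i] = x[i] + mem[0]; the delay
   line is then shifted down by one (_mm_move_ss() pulls the head of the next
   register in, and the 0x39 shuffle rotates it into the tail) and updated
   with the numerator/denominator taps.  The scalar recurrence this code
   implements is roughly:

      for (i=0;i<N;i++)
      {
         y[i] = x[i] + _mem[0];
         for (j=0;j<ord-1;j++)
            _mem[j] = _mem[j+1] + x[i]*_num[j] - y[i]*_den[j];
         _mem[ord-1] = x[i]*_num[ord-1] - y[i]*_den[ord-1];
      }
*/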
void filter_mem16_10(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 num[3], den[3], mem[3];

   int i;

   /* Copy numerator, denominator and memory to aligned xmm */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   num[2] = _mm_setr_ps(_num[8], _num[9], 0, 0);
   den[2] = _mm_setr_ps(_den[8], _den[9], 0, 0);

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));

      mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);

      mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
      mem[2] = _mm_sub_ps(mem[2], _mm_mul_ps(yy, den[2]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}

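/* Order-8 variant of the same pole-zero filter: the eight memory taps fit in
   two xmm registers, so there is no third register to shift from;
   _mm_sub_ss(mem[1], mem[1]) zeroes the lane that the 0x39 shuffle rotates
   into the tail instead. */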
void filter_mem16_8(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 num[2], den[2], mem[2];

   int i;

   /* Copy numerator, denominator and memory to aligned xmm */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_sub_ss(mem[1], mem[1]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}


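/* Dispatcher for the SSE versions above.  The OVERRIDE_FILTER_MEM16 define
   signals that this file provides filter_mem16(), so the generic C version
   should not be compiled in.  Only ord==10 and ord==8 are handled; for any
   other order the function returns without writing to y. */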
#define OVERRIDE_FILTER_MEM16
void filter_mem16(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
{
   if(ord==10)
      filter_mem16_10(x, _num, _den, y, N, ord, _mem);
   else if (ord==8)
      filter_mem16_8(x, _num, _den, y, N, ord, _mem);
}


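/* All-pole (synthesis) counterpart of filter_mem16_10(): same memory layout
   and shifting, but only the denominator taps are applied, i.e.
   _mem[j] = _mem[j+1] - y[i]*_den[j]. */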
void iir_mem16_10(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 den[3], mem[3];

   int i;

   /* Copy denominator and memory to aligned xmm */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   den[2] = _mm_setr_ps(_den[8], _den[9], 0, 0);

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));

      mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);

      mem[2] = _mm_sub_ps(mem[2], _mm_mul_ps(yy, den[2]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}


void iir_mem16_8(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 den[2], mem[2];

   int i;

   /* Copy denominator and memory to aligned xmm */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_sub_ss(mem[1], mem[1]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}

#define OVERRIDE_IIR_MEM16
void iir_mem16(const float *x, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
{
   if(ord==10)
      iir_mem16_10(x, _den, y, N, ord, _mem);
   else if (ord==8)
      iir_mem16_8(x, _den, y, N, ord, _mem);
}


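/* All-zero (analysis/FIR) counterpart of filter_mem16_10(): same memory
   layout and shifting, but only the numerator taps are applied, i.e.
   _mem[j] = _mem[j+1] + x[i]*_num[j]. */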
void fir_mem16_10(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[3], mem[3];

   int i;

   /* Copy numerator and memory to aligned xmm */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
   }
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   num[2] = _mm_setr_ps(_num[8], _num[9], 0, 0);

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));

      mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);

      mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}

void fir_mem16_8(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[2], mem[2];

   int i;

   /* Copy numerator and memory to aligned xmm */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      mem[1] = _mm_sub_ss(mem[1], mem[1]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}

#define OVERRIDE_FIR_MEM16
void fir_mem16(const float *x, const float *_num, float *y, int N, int ord, float *_mem, char *stack)
{
   if(ord==10)
      fir_mem16_10(x, _num, y, N, ord, _mem);
   else if (ord==8)
      fir_mem16_8(x, _num, y, N, ord, _mem);
}