1 /*
2 This software is part of pffft/pfdsp, a set of simple DSP routines.
3
4 Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
5 Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
6 All rights reserved.
7
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions are met:
10 * Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 * Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 * Neither the name of the copyright holder nor the
16 names of its contributors may be used to endorse or promote products
17 derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
23 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /* include own header first, to see missing includes */
32 #include "pf_mixer.h"
33 #include "fmv.h"
34
35 #include <math.h>
36 #include <stdlib.h>
37 #include <assert.h>
38
39 //they dropped M_PI in C99, so we define it:
40 #define PI ((float)3.14159265358979323846)
41
42 //apply to pointers:
43 #define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)))
44 #define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1))
45
46 #define USE_ALIGNED_ADDRESSES 0
47
48
49
50 /*
51 _____ _____ _____ __ _ _
52 | __ \ / ____| __ \ / _| | | (_)
53 | | | | (___ | |__) | | |_ _ _ _ __ ___| |_ _ ___ _ __ ___
54 | | | |\___ \| ___/ | _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
55 | |__| |____) | | | | | |_| | | | | (__| |_| | (_) | | | \__ \
56 |_____/|_____/|_| |_| \__,_|_| |_|\___|\__|_|\___/|_| |_|___/
57
58 */
59
60
61 #if defined(__GNUC__)
62 # define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
63 # define RESTRICT __restrict
64 #elif defined(_MSC_VER)
65 # define ALWAYS_INLINE(return_type) __forceinline return_type
66 # define RESTRICT __restrict
67 #endif
68
69
70 #ifndef PFFFT_SIMD_DISABLE
71 #if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
72 #pragma message "Manual SSE x86/x64 optimizations are ON"
73 #include <xmmintrin.h>
74 #define HAVE_SSE_INTRINSICS 1
75
76 #elif defined(PFFFT_ENABLE_NEON) && defined(__arm__)
77 #pragma message "Manual NEON (arm32) optimizations are ON"
78 #include "sse2neon.h"
79 #define HAVE_SSE_INTRINSICS 1
80
81 #elif defined(PFFFT_ENABLE_NEON) && defined(__aarch64__)
82 #pragma message "Manual NEON (aarch64) optimizations are ON"
83 #include "sse2neon.h"
84 #define HAVE_SSE_INTRINSICS 1
85
86 #endif
87 #endif
88
89 #ifdef HAVE_SSE_INTRINSICS
90
91 typedef __m128 v4sf;
92 # define SIMD_SZ 4
93
94 typedef union v4_union {
95 __m128 v;
96 float f[4];
97 } v4_union;
98
99 #define VMUL(a,b) _mm_mul_ps(a,b)
100 #define VDIV(a,b) _mm_div_ps(a,b)
101 #define VADD(a,b) _mm_add_ps(a,b)
102 #define VSUB(a,b) _mm_sub_ps(a,b)
103 #define LD_PS1(s) _mm_set1_ps(s)
104 #define VLOAD_UNALIGNED(ptr) _mm_loadu_ps((const float *)(ptr))
105 #define VLOAD_ALIGNED(ptr) _mm_load_ps((const float *)(ptr))
106 #define VSTORE_UNALIGNED(ptr, v) _mm_storeu_ps((float*)(ptr), v)
107 #define VSTORE_ALIGNED(ptr, v) _mm_store_ps((float*)(ptr), v)
108 #define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
109 #define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
110
111 #if USE_ALIGNED_ADDRESSES
112 #define VLOAD(ptr) _mm_load_ps((const float *)(ptr))
113 #define VSTORE(ptr, v) _mm_store_ps((float*)(ptr), v)
114 #else
115 #define VLOAD(ptr) _mm_loadu_ps((const float *)(ptr))
116 #define VSTORE(ptr, v) _mm_storeu_ps((float*)(ptr), v)
117 #endif
118
119
/* Query whether the SIMD (SSE / sse2neon) shift-mixer variants were compiled in.
 * This translation unit was built with HAVE_SSE_INTRINSICS, so report success. */
int have_sse_shift_mixer_impl()
{
    return 1;
}
124
125 #else
126
/* Query whether the SIMD (SSE / sse2neon) shift-mixer variants were compiled in.
 * This translation unit was built without HAVE_SSE_INTRINSICS, so report failure. */
int have_sse_shift_mixer_impl()
{
    return 0;
}
131
132 #endif
133
134
135 /*********************************************************************/
136
137 /**************/
138 /*** ALGO A ***/
139 /**************/
140
141 PF_TARGET_CLONES
shift_math_cc(complexf * input,complexf * output,int input_size,float rate,float starting_phase)142 float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase)
143 {
144 rate*=2;
145 //Shifts the complex spectrum. Basically a complex mixer. This version uses cmath.
146 float phase=starting_phase;
147 float phase_increment=rate*PI;
148 float cosval, sinval;
149 for(int i=0;i<input_size; i++)
150 {
151 cosval=cos(phase);
152 sinval=sin(phase);
153 //we multiply two complex numbers.
154 //how? enter this to maxima (software) for explanation:
155 // (a+b*%i)*(c+d*%i), rectform;
156 iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
157 qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
158 phase+=phase_increment;
159 while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
160 while(phase<0) phase+=2*PI;
161 }
162 return phase;
163 }
164
165 /*********************************************************************/
166
167 /**************/
168 /*** ALGO B ***/
169 /**************/
170
shift_table_init(int table_size)171 shift_table_data_t shift_table_init(int table_size)
172 {
173 shift_table_data_t output;
174 output.table=(float*)malloc(sizeof(float)*table_size);
175 output.table_size=table_size;
176 for(int i=0;i<table_size;i++)
177 {
178 output.table[i]=sin(((float)i/table_size)*(PI/2));
179 }
180 return output;
181 }
182
/* Release the sine table allocated by shift_table_init().
 * Safe to call when the table pointer is NULL (free(NULL) is a no-op). */
void shift_table_deinit(shift_table_data_t table_data)
{
    free(table_data.table);
}
187
188
189 PF_TARGET_CLONES
shift_table_cc(complexf * input,complexf * output,int input_size,float rate,shift_table_data_t table_data,float starting_phase)190 float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
191 {
192 rate*=2;
193 //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
194 float phase=starting_phase;
195 float phase_increment=rate*PI;
196 float cosval, sinval;
197 for(int i=0;i<input_size; i++) //@shift_math_cc
198 {
199 int sin_index, cos_index, temp_index, sin_sign, cos_sign;
200 int quadrant=phase/(PI/2); //between 0 and 3
201 float vphase=phase-quadrant*(PI/2);
202 sin_index=(vphase/(PI/2))*table_data.table_size;
203 cos_index=table_data.table_size-1-sin_index;
204 if(quadrant&1) //in quadrant 1 and 3
205 {
206 temp_index=sin_index;
207 sin_index=cos_index;
208 cos_index=temp_index;
209 }
210 sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3
211 cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2
212 sinval=sin_sign*table_data.table[sin_index];
213 cosval=cos_sign*table_data.table[cos_index];
214 //we multiply two complex numbers.
215 //how? enter this to maxima (software) for explanation:
216 // (a+b*%i)*(c+d*%i), rectform;
217 iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
218 qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
219 phase+=phase_increment;
220 while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
221 while(phase<0) phase+=2*PI;
222 }
223 return phase;
224 }
225
226 /*********************************************************************/
227
228 /**************/
229 /*** ALGO C ***/
230 /**************/
231
shift_addfast_init(float rate)232 shift_addfast_data_t shift_addfast_init(float rate)
233 {
234 shift_addfast_data_t output;
235 output.phase_increment=2*rate*PI;
236 for(int i=0;i<4;i++)
237 {
238 output.dsin[i]=sin(output.phase_increment*(i+1));
239 output.dcos[i]=cos(output.phase_increment*(i+1));
240 }
241 return output;
242 }
243
244 #define SADF_L1(j) \
245 cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
246 sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
247 #define SADF_L2(j) \
248 iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
249 qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);
250
PF_TARGET_CLONES
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
{
    /* Complex mixer, 4-fold unrolled, using the precomputed phasor steps from
     * shift_addfast_init().  Returns the end phase, normalized into [-PI, PI],
     * to pass as starting_phase of the next call.
     * NOTE: this function relies on the SADF_L1/SADF_L2 macros above, which
     * token-paste the suffixed local names (cos_vals_0..3 etc.) - the variable
     * names below are therefore fixed. */
    //input_size should be multiple of 4
    //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
    float cos_start=cos(starting_phase);
    float sin_start=sin(starting_phase);
    /* dsin_j/dcos_j = sin/cos of (j+1) phase increments (from shift_addfast_init) */
    float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
        sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
        dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
        dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];

    for(int i=0;i<input_size/4; i++)
    {
        /* rotate: vals_j = start_phasor advanced by (j+1) increments */
        SADF_L1(0)
        SADF_L1(1)
        SADF_L1(2)
        SADF_L1(3)
        /* apply: output[4i+j] = input[4i+j] * vals_j (complex multiply) */
        SADF_L2(0)
        SADF_L2(1)
        SADF_L2(2)
        SADF_L2(3)
        /* the 4th phasor becomes the start for the next group of 4 samples */
        cos_start = cos_vals_3;
        sin_start = sin_vals_3;
    }
    /* report normalized end phase for a click-free continuation */
    starting_phase+=input_size*d->phase_increment;
    while(starting_phase>PI) starting_phase-=2*PI;
    while(starting_phase<-PI) starting_phase+=2*PI;
    return starting_phase;
}
281
282 #undef SADF_L2
283
284
285 #define SADF_L2(j) \
286 tmp_inp_cos = iof(in_out,4*i+j); \
287 tmp_inp_sin = qof(in_out,4*i+j); \
288 iof(in_out,4*i+j)=(cos_vals_ ## j)*tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \
289 qof(in_out,4*i+j)=(sin_vals_ ## j)*tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin;
290
PF_TARGET_CLONES
float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase)
{
    /* In-place variant of shift_addfast_cc(): mixes in_out[] with the phasor,
     * 4 complex samples per iteration.  Returns the end phase, normalized into
     * [-PI, PI], to pass as starting_phase of the next call.
     * NOTE: relies on the SADF_L1/SADF_L2 macros above, which token-paste the
     * suffixed local names - the variable names below are therefore fixed.
     * The in-place SADF_L2 variant buffers each sample in tmp_inp_cos/sin
     * before overwriting it. */
    //input_size should be multiple of 4
    //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
    float cos_start=cos(starting_phase);
    float sin_start=sin(starting_phase);
    /* dsin_j/dcos_j = sin/cos of (j+1) phase increments (from shift_addfast_init) */
    float register tmp_inp_cos, tmp_inp_sin,
        cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
        sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
        dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
        dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];

    for(int i=0;i<N_cplx/4; i++)
    {
        /* rotate: vals_j = start_phasor advanced by (j+1) increments */
        SADF_L1(0)
        SADF_L1(1)
        SADF_L1(2)
        SADF_L1(3)
        /* apply in place: in_out[4i+j] *= vals_j (complex multiply) */
        SADF_L2(0)
        SADF_L2(1)
        SADF_L2(2)
        SADF_L2(3)
        /* the 4th phasor becomes the start for the next group of 4 samples */
        cos_start = cos_vals_3;
        sin_start = sin_vals_3;
    }
    /* report normalized end phase for a click-free continuation */
    starting_phase+=N_cplx*d->phase_increment;
    while(starting_phase>PI) starting_phase-=2*PI;
    while(starting_phase<-PI) starting_phase+=2*PI;
    return starting_phase;
}
322
323 #undef SADF_L1
324 #undef SADF_L2
325
326
327 /*********************************************************************/
328
329 /**************/
330 /*** ALGO D ***/
331 /**************/
332
shift_unroll_init(float rate,int size)333 shift_unroll_data_t shift_unroll_init(float rate, int size)
334 {
335 shift_unroll_data_t output;
336 output.phase_increment=2*rate*PI;
337 output.size = size;
338 output.dsin=(float*)malloc(sizeof(float)*size);
339 output.dcos=(float*)malloc(sizeof(float)*size);
340 float myphase = 0;
341 for(int i=0;i<size;i++)
342 {
343 myphase += output.phase_increment;
344 while(myphase>PI) myphase-=2*PI;
345 while(myphase<-PI) myphase+=2*PI;
346 output.dsin[i]=sin(myphase);
347 output.dcos[i]=cos(myphase);
348 }
349 return output;
350 }
351
/* Release the phasor tables allocated by shift_unroll_init().
 * Tolerates a NULL argument and NULL table pointers; resets the pointers so a
 * repeated call is harmless. */
void shift_unroll_deinit(shift_unroll_data_t* d)
{
    if (d == NULL)
        return;
    free(d->dsin);
    d->dsin = NULL;
    free(d->dcos);
    d->dcos = NULL;
}
361
362 PF_TARGET_CLONES
shift_unroll_cc(complexf * input,complexf * output,int input_size,shift_unroll_data_t * d,float starting_phase)363 float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
364 {
365 //input_size should be multiple of 4
366 //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
367 float cos_start = cos(starting_phase);
368 float sin_start = sin(starting_phase);
369 register float cos_val = cos_start, sin_val = sin_start;
370 for(int i=0;i<input_size; i++)
371 {
372 iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i);
373 qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i);
374 // calculate complex phasor for next iteration
375 cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
376 sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
377 }
378 starting_phase+=input_size*d->phase_increment;
379 while(starting_phase>PI) starting_phase-=2*PI;
380 while(starting_phase<-PI) starting_phase+=2*PI;
381 return starting_phase;
382 }
383
384 PF_TARGET_CLONES
shift_unroll_inp_c(complexf * in_out,int size,shift_unroll_data_t * d,float starting_phase)385 float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase)
386 {
387 float cos_start = cos(starting_phase);
388 float sin_start = sin(starting_phase);
389 register float cos_val = cos_start, sin_val = sin_start;
390 for(int i=0;i<size; i++)
391 {
392 register float inp_i = iof(in_out,i);
393 register float inp_q = qof(in_out,i);
394 iof(in_out,i) = cos_val*inp_i - sin_val*inp_q;
395 qof(in_out,i) = sin_val*inp_i + cos_val*inp_q;
396 // calculate complex phasor for next iteration
397 cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
398 sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
399 }
400 starting_phase += size * d->phase_increment;
401 while(starting_phase>PI) starting_phase-=2*PI;
402 while(starting_phase<-PI) starting_phase+=2*PI;
403 return starting_phase;
404 }
405
406
407 /*********************************************************************/
408
409 /**************/
410 /*** ALGO E ***/
411 /**************/
412
shift_limited_unroll_init(float rate)413 shift_limited_unroll_data_t shift_limited_unroll_init(float rate)
414 {
415 shift_limited_unroll_data_t output;
416 output.phase_increment=2*rate*PI;
417 float myphase = 0;
418 for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++)
419 {
420 myphase += output.phase_increment;
421 while(myphase>PI) myphase-=2*PI;
422 while(myphase<-PI) myphase+=2*PI;
423 output.dcos[i] = cos(myphase);
424 output.dsin[i] = sin(myphase);
425 }
426 output.complex_phase.i = 1.0F;
427 output.complex_phase.q = 0.0F;
428 return output;
429 }
430
PF_TARGET_CLONES
void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d)
{
    /* Complex mixer processing `size` samples in blocks of up to
     * PF_SHIFT_LIMITED_UNROLL_SIZE.  Inside a block each phasor is derived
     * from the block-start phasor and d's delta tables; after each block the
     * phasor is re-normalized to magnitude 1, so rounding errors do not
     * accumulate.  The phase state is kept in d->complex_phase across calls.
     * NOTE(review): assumes size is a multiple of PF_SHIFT_LIMITED_SIMD_SZ
     * (trailing remainder samples are skipped), and the pointer advance below
     * steps by the full unroll size even for a short final block - confirm
     * callers only pass suitably sized buffers. */
    float cos_start = d->complex_phase.i;
    float sin_start = d->complex_phase.q;
    register float cos_val = cos_start, sin_val = sin_start, mag;
    while (size > 0)
    {
        int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : size;
        for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
        {
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
            {
                /* out[] = inp[] * current phasor (complex multiply) */
                iof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
                qof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
                // calculate complex phasor for next iteration
                cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
                sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            }
        }
        // "starts := vals := vals / |vals|"  (re-normalize so magnitude stays 1)
        mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
        cos_val /= mag;
        sin_val /= mag;
        cos_start = cos_val;
        sin_start = sin_val;

        input += PF_SHIFT_LIMITED_UNROLL_SIZE;
        output += PF_SHIFT_LIMITED_UNROLL_SIZE;
        size -= PF_SHIFT_LIMITED_UNROLL_SIZE;
    }
    /* persist the phase state for the next call */
    d->complex_phase.i = cos_val;
    d->complex_phase.q = sin_val;
}
465
PF_TARGET_CLONES
void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d)
{
    /* In-place variant of shift_limited_unroll_cc(): mixes in_out[] with the
     * rotating phasor in blocks of up to PF_SHIFT_LIMITED_UNROLL_SIZE,
     * re-normalizing the phasor after each block.  Phase state is kept in
     * d->complex_phase across calls.
     * NOTE(review): assumes N_cplx is a multiple of PF_SHIFT_LIMITED_SIMD_SZ
     * (trailing remainder samples are skipped) - confirm with callers. */
    /* small staging buffers: the samples are copied out before being overwritten */
    float inp_i[PF_SHIFT_LIMITED_SIMD_SZ];
    float inp_q[PF_SHIFT_LIMITED_SIMD_SZ];
    // "vals := starts := phase_state"
    float cos_start = d->complex_phase.i;
    float sin_start = d->complex_phase.q;
    register float cos_val = cos_start, sin_val = sin_start, mag;
    while (N_cplx)
    {
        int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
        {
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
                inp_i[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].i;
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
                inp_q[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].q;
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
            {
                // "out[] = inp[] * vals"  (complex multiply)
                iof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j];
                qof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j];
                // calculate complex phasor for next iteration
                // "vals := d[] * starts"
                cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
                sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            }
        }
        // "starts := vals := vals / |vals|"  (re-normalize so magnitude stays 1)
        mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
        cos_val /= mag;
        sin_val /= mag;
        cos_start = cos_val;
        sin_start = sin_val;

        in_out += PF_SHIFT_LIMITED_UNROLL_SIZE;
        N_cplx -= PF_SHIFT_LIMITED_UNROLL_SIZE;
    }
    // "phase_state := starts"  (persist for the next call)
    d->complex_phase.i = cos_start;
    d->complex_phase.q = sin_start;
}
509
510
511 #ifdef HAVE_SSE_INTRINSICS
512
513 /*********************************************************************/
514
515 /**************/
516 /*** ALGO F ***/
517 /**************/
518
shift_limited_unroll_A_sse_init(float relative_freq,float phase_start_rad)519 shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad)
520 {
521 shift_limited_unroll_A_sse_data_t output;
522 float myphase;
523
524 output.phase_increment = 2*relative_freq*PI;
525
526 myphase = 0.0F;
527 for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
528 {
529 for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
530 {
531 myphase += output.phase_increment;
532 while(myphase>PI) myphase-=2*PI;
533 while(myphase<-PI) myphase+=2*PI;
534 }
535 output.dcos[i] = cos(myphase);
536 output.dsin[i] = sin(myphase);
537 for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
538 {
539 output.dcos[i+k] = output.dcos[i];
540 output.dsin[i+k] = output.dsin[i];
541 }
542 }
543
544 output.dcos_blk = 0.0F;
545 output.dsin_blk = 0.0F;
546
547 myphase = phase_start_rad;
548 for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
549 {
550 output.phase_state_i[i] = cos(myphase);
551 output.phase_state_q[i] = sin(myphase);
552 myphase += output.phase_increment;
553 while(myphase>PI) myphase-=2*PI;
554 while(myphase<-PI) myphase+=2*PI;
555 }
556 return output;
557 }
558
559
PF_TARGET_CLONES
void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d)
{
    /* In-place SIMD complex mixer, variant A (separate dcos[]/dsin[] tables).
     * Processes N_cplx samples in blocks of up to PF_SHIFT_LIMITED_UNROLL_SIZE,
     * 4 complex samples per inner iteration; the 4-lane phasor is
     * re-normalized after each block so its magnitude does not drift.
     * NOTE(review): assumes N_cplx is a multiple of 4; with
     * USE_ALIGNED_ADDRESSES set, in_out must also be 16-byte aligned -
     * confirm with callers. */
    // "vals := starts := phase_state"
    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
    __m128 cos_vals = cos_starts;
    __m128 sin_vals = sin_starts;
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT p_trig_cos_tab;
    __m128 * RESTRICT p_trig_sin_tab;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        int B = NB;
        /* rewind the delta tables at every block start */
        p_trig_cos_tab = (__m128*)( &d->dcos[0] );
        p_trig_sin_tab = (__m128*)( &d->dsin[0] );
        while (B)
        {
            // complex multiplication of 4 complex values from/to in_out[]
            // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
            // "out[] = inp[] * vals"
            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
            VSTORE(u, interl_prod_a);
            VSTORE(u+1, interl_prod_b);
            u += 2;
            // calculate complex phasor for next iteration
            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
            // "vals := d[] * starts"
            inp_re = VLOAD(p_trig_cos_tab);
            inp_im = VLOAD(p_trig_sin_tab);
            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
            ++p_trig_cos_tab;
            ++p_trig_sin_tab;
            B -= 4;
        }
        N_cplx -= NB;
        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
        /* re-use product_re[]/product_im[] for normalization */
        // "starts := vals := vals / |vals|"
        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
#if 0
        // fast reciprocal-sqrt approximation - kept disabled for accuracy:
        // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
        // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
        product_im = _mm_rsqrt_ps(product_re);
        cos_starts = cos_vals = VMUL(cos_vals, product_im);
        sin_starts = sin_vals = VMUL(sin_vals, product_im);
#else
        // exact sqrt + div:
        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
        product_im = _mm_sqrt_ps(product_re);
        cos_starts = cos_vals = VDIV(cos_vals, product_im);
        sin_starts = sin_vals = VDIV(sin_vals, product_im);
#endif
    }
    // "phase_state := starts"  (persist the 4-lane phasor for the next call)
    VSTORE( &d->phase_state_i[0], cos_starts );
    VSTORE( &d->phase_state_q[0], sin_starts );
}
629
630
631 /*********************************************************************/
632
633 /**************/
634 /*** ALGO G ***/
635 /**************/
636
/* Precompute the state for shift_limited_unroll_B_sse_inp_c() (variant B:
 * a single interleaved table).  Each group of PF_SHIFT_LIMITED_SIMD_SZ slots
 * in dtrig[] holds {cos, sin, cos, sin} of the phasor step for one whole
 * SIMD group of 4 samples.
 * NOTE(review): the i+1/i+2/i+3 writes hard-code PF_SHIFT_LIMITED_SIMD_SZ==4
 * - confirm if the constant ever changes. */
shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad)
{
    shift_limited_unroll_B_sse_data_t output;
    float myphase;

    output.phase_increment = 2*relative_freq*PI;

    myphase = 0.0F;
    for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
    {
        /* advance by one full SIMD group, keeping the phase in [-PI, PI] */
        for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
        {
            myphase += output.phase_increment;
            while(myphase>PI) myphase-=2*PI;
            while(myphase<-PI) myphase+=2*PI;
        }
        /* interleaved layout: cos, sin, cos, sin */
        output.dtrig[i+0] = cos(myphase);
        output.dtrig[i+1] = sin(myphase);
        output.dtrig[i+2] = output.dtrig[i+0];
        output.dtrig[i+3] = output.dtrig[i+1];
    }

    output.dcos_blk = 0.0F;
    output.dsin_blk = 0.0F;

    /* per-lane phase state: 4 consecutive phases starting at phase_start_rad */
    myphase = phase_start_rad;
    for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
    {
        output.phase_state_i[i] = cos(myphase);
        output.phase_state_q[i] = sin(myphase);
        myphase += output.phase_increment;
        while(myphase>PI) myphase-=2*PI;
        while(myphase<-PI) myphase+=2*PI;
    }
    return output;
}
673
674
PF_TARGET_CLONES
void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d)
{
    /* In-place SIMD complex mixer, variant B (single interleaved dtrig[]
     * table: {cos, sin, cos, sin} per group; un-interleaved on load).
     * Processes N_cplx samples in blocks of up to PF_SHIFT_LIMITED_UNROLL_SIZE,
     * 4 complex samples per inner iteration; the 4-lane phasor is
     * re-normalized after each block so its magnitude does not drift.
     * NOTE(review): assumes N_cplx is a multiple of 4; with
     * USE_ALIGNED_ADDRESSES set, in_out must also be 16-byte aligned -
     * confirm with callers. */
    // "vals := starts := phase_state"
    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
    __m128 cos_vals = cos_starts;
    __m128 sin_vals = sin_starts;
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT p_trig_tab;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        int B = NB;
        /* rewind the delta table at every block start */
        p_trig_tab = (__m128*)( &d->dtrig[0] );
        while (B)
        {
            // complex multiplication of 4 complex values from/to in_out[]
            // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
            // "out[] = inp[] * vals"
            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
            VSTORE(u, interl_prod_a);
            VSTORE(u+1, interl_prod_b);
            u += 2;
            // calculate complex phasor for next iteration
            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
            // "vals := d[] * starts"
            /* one interleaved load, split into a cos vector and a sin vector */
            product_re = VLOAD(p_trig_tab);
            UNINTERLEAVE2(product_re, product_re, inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
            ++p_trig_tab;
            B -= 4;
        }
        N_cplx -= NB;
        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
        /* re-use product_re[]/product_im[] for normalization */
        // "starts := vals := vals / |vals|"
        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
#if 0
        // fast reciprocal-sqrt approximation - kept disabled for accuracy:
        // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
        // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
        product_im = _mm_rsqrt_ps(product_re);
        cos_starts = cos_vals = VMUL(cos_vals, product_im);
        sin_starts = sin_vals = VMUL(sin_vals, product_im);
#else
        // exact sqrt + div:
        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
        product_im = _mm_sqrt_ps(product_re);
        cos_starts = cos_vals = VDIV(cos_vals, product_im);
        sin_starts = sin_vals = VDIV(sin_vals, product_im);
#endif
    }
    // "phase_state := starts"  (persist the 4-lane phasor for the next call)
    VSTORE( &d->phase_state_i[0], cos_starts );
    VSTORE( &d->phase_state_q[0], sin_starts );
}
741
742
743 /*********************************************************************/
744
745
746 /**************/
747 /*** ALGO H ***/
748 /**************/
749
/* Precompute the state for shift_limited_unroll_C_sse_inp_c() (variant C:
 * a single table of alternating SIMD-width vectors).  For each group, slots
 * [2*i .. 2*i+3] hold the replicated cos value and slots [2*i+4 .. 2*i+7]
 * the replicated sin value, so the mixer loads two adjacent vectors per step.
 * NOTE(review): the +4 offsets hard-code PF_SHIFT_LIMITED_SIMD_SZ==4 -
 * confirm if the constant ever changes. */
shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad)
{
    shift_limited_unroll_C_sse_data_t output;
    float myphase;

    output.phase_increment = 2*relative_freq*PI;

    myphase = 0.0F;
    for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
    {
        /* advance by one full SIMD group, keeping the phase in [-PI, PI] */
        for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
        {
            myphase += output.phase_increment;
            while(myphase>PI) myphase-=2*PI;
            while(myphase<-PI) myphase+=2*PI;
        }
        /* one cos vector followed by one sin vector, each lane-replicated */
        output.dinterl_trig[2*i] = cos(myphase);
        output.dinterl_trig[2*i+4] = sin(myphase);
        for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
        {
            output.dinterl_trig[2*i+k] = output.dinterl_trig[2*i];
            output.dinterl_trig[2*i+k+4] = output.dinterl_trig[2*i+4];
        }
    }

    output.dcos_blk = 0.0F;
    output.dsin_blk = 0.0F;

    /* per-lane phase state: 4 consecutive phases starting at phase_start_rad */
    myphase = phase_start_rad;
    for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
    {
        output.phase_state_i[i] = cos(myphase);
        output.phase_state_q[i] = sin(myphase);
        myphase += output.phase_increment;
        while(myphase>PI) myphase-=2*PI;
        while(myphase<-PI) myphase+=2*PI;
    }
    return output;
}
789
790
PF_TARGET_CLONES
void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d)
{
    /* In-place SIMD complex mixer, variant C (dinterl_trig[] holds alternating
     * cos and sin vectors, so no shuffle is needed for the phasor update).
     * Processes N_cplx samples in blocks of up to PF_SHIFT_LIMITED_UNROLL_SIZE,
     * 4 complex samples per inner iteration; the 4-lane phasor is
     * re-normalized after each block so its magnitude does not drift.
     * NOTE(review): assumes N_cplx is a multiple of 4; with
     * USE_ALIGNED_ADDRESSES set, in_out must also be 16-byte aligned -
     * confirm with callers. */
    // "vals := starts := phase_state"
    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
    __m128 cos_vals = cos_starts;
    __m128 sin_vals = sin_starts;
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT p_trig_tab;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        int B = NB;
        /* rewind the delta table at every block start */
        p_trig_tab = (__m128*)( &d->dinterl_trig[0] );
        while (B)
        {
            // complex multiplication of 4 complex values from/to in_out[]
            // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
            // "out[] = inp[] * vals"
            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
            VSTORE(u, interl_prod_a);
            VSTORE(u+1, interl_prod_b);
            u += 2;
            // calculate complex phasor for next iteration
            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
            // "vals := d[] * starts"
            /* adjacent vectors: cos step vector, then sin step vector */
            inp_re = VLOAD(p_trig_tab);
            inp_im = VLOAD(p_trig_tab+1);
            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
            p_trig_tab += 2;
            B -= 4;
        }
        N_cplx -= NB;
        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
        /* re-use product_re[]/product_im[] for normalization */
        // "starts := vals := vals / |vals|"
        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
#if 0
        // fast reciprocal-sqrt approximation - kept disabled for accuracy:
        // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
        // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
        product_im = _mm_rsqrt_ps(product_re);
        cos_starts = cos_vals = VMUL(cos_vals, product_im);
        sin_starts = sin_vals = VMUL(sin_vals, product_im);
#else
        // exact sqrt + div:
        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
        // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
        product_im = _mm_sqrt_ps(product_re);
        cos_starts = cos_vals = VDIV(cos_vals, product_im);
        sin_starts = sin_vals = VDIV(sin_vals, product_im);
#endif
    }
    // "phase_state := starts"  (persist the 4-lane phasor for the next call)
    VSTORE( &d->phase_state_i[0], cos_starts );
    VSTORE( &d->phase_state_q[0], sin_starts );
}
857
858
859 #else
860
861 /*********************************************************************/
862
shift_limited_unroll_A_sse_init(float relative_freq,float phase_start_rad)863 shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) {
864 assert(0);
865 shift_limited_unroll_A_sse_data_t r;
866 return r;
867 }
shift_limited_unroll_B_sse_init(float relative_freq,float phase_start_rad)868 shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) {
869 assert(0);
870 shift_limited_unroll_B_sse_data_t r;
871 return r;
872 }
shift_limited_unroll_C_sse_init(float relative_freq,float phase_start_rad)873 shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) {
874 assert(0);
875 shift_limited_unroll_C_sse_data_t r;
876 return r;
877 }
878
void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) {
    /* Fallback stub for builds without SSE intrinsics - must never be called. */
    (void)in_out; (void)N_cplx; (void)d;  /* silence -Wunused-parameter */
    assert(0);
}
void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) {
    /* Fallback stub for builds without SSE intrinsics - must never be called. */
    (void)in_out; (void)N_cplx; (void)d;  /* silence -Wunused-parameter */
    assert(0);
}
void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) {
    /* Fallback stub for builds without SSE intrinsics - must never be called. */
    (void)in_out; (void)N_cplx; (void)d;  /* silence -Wunused-parameter */
    assert(0);
}
888
889 #endif
890
891
892 /*********************************************************************/
893
894 /**************/
895 /*** ALGO I ***/
896 /**************/
897
void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state)
{
    /* Recompute the oscillator step coefficients for a new shift rate,
     * keeping the phase already accumulated in lane 0 of *state. */
    const float phase_step = rate*PI;           /* phase advance per single sample */
    const float k1_s = tan(0.5*phase_step);
    const float k2_s = 2*k1_s /(1 + k1_s * k1_s);
    /* fan lane 0 out to the other SIMD lanes: lane j trails lane j-1
     * by exactly one single-sample phase step */
    for (int lane=1; lane<PF_SHIFT_RECURSIVE_SIMD_SZ; lane++)
    {
        float c = state->u_cos[lane-1];
        float s = state->v_sin[lane-1];
        float t = c - k1_s * s;    /* tan-half-angle rotation, small step */
        s += k2_s * t;
        c = t - k1_s * s;
        state->u_cos[lane] = c;
        state->v_sin[lane] = s;
    }

    /* coefficients that advance all lanes by one whole SIMD block at once */
    float big_step = phase_step * PF_SHIFT_RECURSIVE_SIMD_SZ;
    while (big_step >  PI) big_step -= 2*PI;   /* wrap into (-PI, PI] */
    while (big_step < -PI) big_step += 2*PI;
    conf->k1 = tan(0.5*big_step);
    conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
}
922
void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state)
{
    /* Seed lane 0 of the phasor with the starting phase; the common
     * zero-phase case skips the trig calls entirely. */
    if (starting_phase == 0.0F)
    {
        state->u_cos[0] = 1.0F;
        state->v_sin[0] = 0.0F;
    }
    else
    {
        state->u_cos[0] = cos(starting_phase);
        state->v_sin[0] = sin(starting_phase);
    }
    /* derive the remaining lanes and the step coefficients from the rate */
    shift_recursive_osc_update_rate(rate, conf, state);
}
937
938
939 PF_TARGET_CLONES
shift_recursive_osc_cc(const complexf * input,complexf * output,int size,const shift_recursive_osc_conf_t * conf,shift_recursive_osc_t * state_ext)940 void shift_recursive_osc_cc(const complexf *input, complexf* output,
941 int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
942 {
943 float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
944 float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
945 float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
946 shift_recursive_osc_t state = *state_ext;
947 const float k1 = conf->k1;
948 const float k2 = conf->k2;
949 for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_cc
950 {
951 //we multiply two complex numbers - similar to shift_math_cc
952 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
953 {
954 inp_i[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
955 inp_q[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
956 }
957 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
958 {
959 iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
960 qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
961 }
962 // update complex phasor - like incrementing phase
963 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
964 tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
965 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
966 state.v_sin[j] += k2 * tmp[j];
967 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
968 state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
969 }
970 *state_ext = state;
971 }
972
PF_TARGET_CLONES
void shift_recursive_osc_inp_c(complexf* in_out,
    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
    /* In-place variant of shift_recursive_osc_cc(): frequency-shifts in_out[]
     * by multiplying each sample with a recursively advanced unit phasor
     * (no per-sample sin()/cos()).  Works in blocks of
     * PF_SHIFT_RECURSIVE_SIMD_SZ samples; a tail of
     * size % PF_SHIFT_RECURSIVE_SIMD_SZ samples is NOT processed.
     * Oscillator phase is carried across calls via *state_ext. */
    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
    shift_recursive_osc_t state = *state_ext;  /* local copy for the hot loop */
    const float k1 = conf->k1;
    const float k2 = conf->k2;
    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
    {
        /* copy the inputs first: the multiply below overwrites in_out[] */
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
            inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
        }
        //we multiply two complex numbers - similar to shift_math_cc
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
            qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
        }
        // update complex phasor - like incrementing phase
        // (split into separate j-loops so each can auto-vectorize)
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.v_sin[j] += k2 * tmp[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
    }
    *state_ext = state;  /* write back the advanced phase */
}
1006
1007 PF_TARGET_CLONES
gen_recursive_osc_c(complexf * output,int size,const shift_recursive_osc_conf_t * conf,shift_recursive_osc_t * state_ext)1008 void gen_recursive_osc_c(complexf* output,
1009 int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
1010 {
1011 float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
1012 shift_recursive_osc_t state = *state_ext;
1013 const float k1 = conf->k1;
1014 const float k2 = conf->k2;
1015 for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@gen_recursive_osc_c
1016 {
1017 // output complex oscillator value
1018 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1019 {
1020 iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j];
1021 qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j];
1022 }
1023 // update complex phasor - like incrementing phase
1024 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1025 tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
1026 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1027 state.v_sin[j] += k2 * tmp[j];
1028 for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1029 state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
1030 }
1031 *state_ext = state;
1032 }
1033
1034
1035 #ifdef HAVE_SSE_INTRINSICS
1036
1037 /*********************************************************************/
1038
1039 /**************/
1040 /*** ALGO J ***/
1041 /**************/
1042
void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
{
    /* Recompute the SSE oscillator step coefficients for a new shift rate,
     * keeping the phase already accumulated in lane 0 of *state. */
    const float phase_step = rate*PI;           /* phase advance per single sample */
    const float k1_s = tan(0.5*phase_step);
    const float k2_s = 2*k1_s /(1 + k1_s * k1_s);
    /* fan lane 0 out to the other SIMD lanes: lane j trails lane j-1
     * by exactly one single-sample phase step */
    for (int lane=1; lane<PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; lane++)
    {
        float c = state->u_cos[lane-1];
        float s = state->v_sin[lane-1];
        float t = c - k1_s * s;    /* tan-half-angle rotation, small step */
        s += k2_s * t;
        c = t - k1_s * s;
        state->u_cos[lane] = c;
        state->v_sin[lane] = s;
    }

    /* coefficients that advance all lanes by one whole SIMD block at once */
    float big_step = phase_step * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ;
    while (big_step >  PI) big_step -= 2*PI;   /* wrap into (-PI, PI] */
    while (big_step < -PI) big_step += 2*PI;
    conf->k1 = tan(0.5*big_step);
    conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
}
1067
1068
void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
{
    /* Seed lane 0 of the phasor with the starting phase; the common
     * zero-phase case skips the trig calls entirely. */
    if (starting_phase == 0.0F)
    {
        state->u_cos[0] = 1.0F;
        state->v_sin[0] = 0.0F;
    }
    else
    {
        state->u_cos[0] = cos(starting_phase);
        state->v_sin[0] = sin(starting_phase);
    }
    /* derive the remaining lanes and the step coefficients from the rate */
    shift_recursive_osc_sse_update_rate(rate, conf, state);
}
1083
1084
PF_TARGET_CLONES
void shift_recursive_osc_sse_inp_c(complexf* in_out,
    int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
{
    /* SSE variant of shift_recursive_osc_inp_c(): in-place frequency shift of
     * N_cplx complex samples, 4 at a time, using a recursively advanced
     * unit phasor (no per-sample trig).  Phase persists via *state_ext.
     * NOTE: the loop decrements N_cplx by 4 and tests it against 0, so
     * N_cplx must be a multiple of 4 - otherwise this never terminates.
     * NOTE(review): in_out is accessed through __m128 loads/stores, which
     * presumably requires 16-byte alignment - confirm with callers.
     * NOTE(review): unlike shift_limited_unroll_*_sse_inp_c there is no
     * periodic re-normalization of the phasor magnitude here - confirm the
     * recurrence stays stable for very long streams. */
    const __m128 k1 = LD_PS1( conf->k1 );
    const __m128 k2 = LD_PS1( conf->k2 );
    __m128 u_cos = VLOAD( &state_ext->u_cos[0] );
    __m128 v_sin = VLOAD( &state_ext->v_sin[0] );
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        //inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i;
        //inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q;
        UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */

        //we multiply two complex numbers - similar to shift_math_cc
        //iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
        //qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
        product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) );
        product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) );
        /* re-interleave products into (re,im) pair order and store back */
        INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
        VSTORE(u, interl_prod_a);
        VSTORE(u+1, interl_prod_b);
        u += 2;  /* two __m128 = 4 complex samples consumed */

        // update complex phasor - like incrementing phase
        // tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
        product_re = VSUB( u_cos, VMUL(k1, v_sin) );
        // state.v_sin[j] += k2 * tmp[j];
        v_sin = VADD( v_sin, VMUL(k2, product_re) );
        // state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
        u_cos = VSUB( product_re, VMUL(k1, v_sin) );

        N_cplx -= 4;
    }
    /* write back the advanced phase for the next call */
    VSTORE( &state_ext->u_cos[0], u_cos );
    VSTORE( &state_ext->v_sin[0], v_sin );
}
1127
1128 #else
1129
void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
{
    /* Fallback stub for builds without SSE intrinsics - must never be called. */
    (void)rate; (void)conf; (void)state;  /* silence -Wunused-parameter */
    assert(0);
}
1134
void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
{
    /* Fallback stub for builds without SSE intrinsics - must never be called. */
    (void)rate; (void)starting_phase; (void)conf; (void)state;  /* silence -Wunused-parameter */
    assert(0);
}
1139
1140
void shift_recursive_osc_sse_inp_c(complexf* in_out,
    int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
{
    /* Fallback stub for builds without SSE intrinsics - must never be called. */
    (void)in_out; (void)N_cplx; (void)conf; (void)state_ext;  /* silence -Wunused-parameter */
    assert(0);
}
1146
1147 #endif
1148
1149