/*
This software is part of pffft/pfdsp, a set of simple DSP routines.

Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
Copyright (c) 2020  Hayati Ayguen <h_ayguen@web.de>
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* include own header first, to see missing includes */
#include "pf_mixer.h"
#include "fmv.h"

#include <math.h>
#include <stdlib.h>
#include <assert.h>

//M_PI is not guaranteed by standard C (it is only a POSIX extension), so we define our own:
#define PI ((float)3.14159265358979323846)

//accessors for the real (i) and imaginary (q) part of element i of a complexf array - apply to pointers:
#define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)))
#define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1))

#define USE_ALIGNED_ADDRESSES  0



/*
  _____   _____ _____      __                  _   _
 |  __ \ / ____|  __ \    / _|                | | (_)
 | |  | | (___ | |__) |  | |_ _   _ _ __   ___| |_ _  ___  _ __  ___
 | |  | |\___ \|  ___/   |  _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
 | |__| |____) | |       | | | |_| | | | | (__| |_| | (_) | | | \__ \
 |_____/|_____/|_|       |_|  \__,_|_| |_|\___|\__|_|\___/|_| |_|___/

*/


#if defined(__GNUC__)
#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
#  define RESTRICT __restrict
#elif defined(_MSC_VER)
#  define ALWAYS_INLINE(return_type) __forceinline return_type
#  define RESTRICT __restrict
#endif


#ifndef PFFFT_SIMD_DISABLE
#if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
  #pragma message "Manual SSE x86/x64 optimizations are ON"
  #include <xmmintrin.h>
  #define HAVE_SSE_INTRINSICS 1

#elif defined(PFFFT_ENABLE_NEON) && defined(__arm__)
  #pragma message "Manual NEON (arm32) optimizations are ON"
  #include "sse2neon.h"
  #define HAVE_SSE_INTRINSICS 1

#elif defined(PFFFT_ENABLE_NEON) && defined(__aarch64__)
  #pragma message "Manual NEON (aarch64) optimizations are ON"
  #include "sse2neon.h"
  #define HAVE_SSE_INTRINSICS 1

#endif
#endif

#ifdef HAVE_SSE_INTRINSICS

typedef __m128 v4sf;
#  define SIMD_SZ 4

typedef union v4_union {
  __m128  v;
  float f[4];
} v4_union;

#define VMUL(a,b)                 _mm_mul_ps(a,b)
#define VDIV(a,b)                 _mm_div_ps(a,b)
#define VADD(a,b)                 _mm_add_ps(a,b)
#define VSUB(a,b)                 _mm_sub_ps(a,b)
#define LD_PS1(s)                 _mm_set1_ps(s)
#define VLOAD_UNALIGNED(ptr)      _mm_loadu_ps((const float *)(ptr))
#define VLOAD_ALIGNED(ptr)        _mm_load_ps((const float *)(ptr))
#define VSTORE_UNALIGNED(ptr, v)  _mm_storeu_ps((float*)(ptr), v)
#define VSTORE_ALIGNED(ptr, v)    _mm_store_ps((float*)(ptr), v)
#define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
#define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
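
/* Helper notes (documentation only): INTERLEAVE2() and UNINTERLEAVE2() convert
 * between interleaved I/Q storage and separate real/imag vectors.
 * Illustrative lane view for 4 complex samples i0+j*q0 .. i3+j*q3:
 *   UNINTERLEAVE2({i0,q0,i1,q1}, {i2,q2,i3,q3}, re, im)
 *     -> re = {i0,i1,i2,i3},  im = {q0,q1,q2,q3}
 *   INTERLEAVE2(re, im, a, b)
 *     -> a  = {i0,q0,i1,q1},  b  = {i2,q2,i3,q3}
 */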

#if USE_ALIGNED_ADDRESSES
  #define VLOAD(ptr)              _mm_load_ps((const float *)(ptr))
  #define VSTORE(ptr, v)          _mm_store_ps((float*)(ptr), v)
#else
  #define VLOAD(ptr)              _mm_loadu_ps((const float *)(ptr))
  #define VSTORE(ptr, v)          _mm_storeu_ps((float*)(ptr), v)
#endif


int have_sse_shift_mixer_impl()
{
    return 1;
}

#else

int have_sse_shift_mixer_impl()
{
    return 0;
}

#endif


/*********************************************************************/

/**************/
/*** ALGO A ***/
/**************/

PF_TARGET_CLONES
float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase)
{
    rate*=2;
    //Shifts the complex spectrum. Basically a complex mixer. This version uses the math.h trigonometric functions.
    float phase=starting_phase;
    float phase_increment=rate*PI;
    float cosval, sinval;
    for(int i=0;i<input_size; i++)
    {
        cosval=cos(phase);
        sinval=sin(phase);
        //we multiply two complex numbers.
        //how? enter this into maxima (software) for an explanation:
        //   (a+b*%i)*(c+d*%i), rectform;
        iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
        qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
        phase+=phase_increment;
        while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
        while(phase<0) phase+=2*PI;
    }
    return phase;
}
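
/* Usage sketch (illustrative only; `buf` and the chosen rate are caller-provided
 * assumptions, not part of this library):
 *
 *   complexf buf[1024];
 *   float phase = 0.0F;
 *   // mix by a relative frequency of -0.1 (i.e. -0.1 * samplerate),
 *   // carrying the oscillator phase across successive calls:
 *   phase = shift_math_cc(buf, buf, 1024, -0.1F, phase);
 */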

/*********************************************************************/

/**************/
/*** ALGO B ***/
/**************/

shift_table_data_t shift_table_init(int table_size)
{
    shift_table_data_t output;
    output.table=(float*)malloc(sizeof(float)*table_size);
    output.table_size=table_size;
    for(int i=0;i<table_size;i++)
    {
        output.table[i]=sin(((float)i/table_size)*(PI/2));
    }
    return output;
}

void shift_table_deinit(shift_table_data_t table_data)
{
    free(table_data.table);
}


PF_TARGET_CLONES
float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
{
    rate*=2;
    //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
    float phase=starting_phase;
    float phase_increment=rate*PI;
    float cosval, sinval;
    for(int i=0;i<input_size; i++) //@shift_table_cc
    {
        int sin_index, cos_index, temp_index, sin_sign, cos_sign;
        int quadrant=phase/(PI/2); //between 0 and 3
        float vphase=phase-quadrant*(PI/2);
        sin_index=(vphase/(PI/2))*table_data.table_size;
        cos_index=table_data.table_size-1-sin_index;
        if(quadrant&1) //in quadrant 1 and 3
        {
            temp_index=sin_index;
            sin_index=cos_index;
            cos_index=temp_index;
        }
        sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3
        cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2
        sinval=sin_sign*table_data.table[sin_index];
        cosval=cos_sign*table_data.table[cos_index];
        //we multiply two complex numbers.
        //how? enter this into maxima (software) for an explanation:
        //   (a+b*%i)*(c+d*%i), rectform;
        iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
        qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
        phase+=phase_increment;
        while(phase>2*PI) phase-=2*PI; //@shift_table_cc: normalize phase
        while(phase<0) phase+=2*PI;
    }
    return phase;
}
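
/* How the quarter-wave table is used (documentation only): table[] holds
 * sin(x) for x in [0, PI/2); a full-circle phase p = quadrant*(PI/2) + vphase
 * is folded back into that range, with s indexing sin(vphase) and
 * c = table_size-1-s indexing approximately cos(vphase):
 *   quadrant 0: sinval = +table[s], cosval = +table[c]
 *   quadrant 1: sinval = +table[c], cosval = -table[s]
 *   quadrant 2: sinval = -table[s], cosval = -table[c]
 *   quadrant 3: sinval = -table[c], cosval = +table[s]
 */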

/*********************************************************************/

/**************/
/*** ALGO C ***/
/**************/

shift_addfast_data_t shift_addfast_init(float rate)
{
    shift_addfast_data_t output;
    output.phase_increment=2*rate*PI;
    for(int i=0;i<4;i++)
    {
        output.dsin[i]=sin(output.phase_increment*(i+1));
        output.dcos[i]=cos(output.phase_increment*(i+1));
    }
    return output;
}
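
/* The SADF_L1 step below advances the phasor by the angle-addition identities,
 * with dcos_j = cos((j+1)*dphi) and dsin_j = sin((j+1)*dphi) precomputed in
 * shift_addfast_init():
 *   cos(a + (j+1)*dphi) = cos(a)*dcos_j - sin(a)*dsin_j
 *   sin(a + (j+1)*dphi) = sin(a)*dcos_j + cos(a)*dsin_j
 * so four consecutive phasors are derived from one (cos_start, sin_start)
 * pair per loop iteration, and the fourth one becomes the next start value.
 */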

#define SADF_L1(j) \
    cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
    sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
#define SADF_L2(j) \
    iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
    qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);

PF_TARGET_CLONES
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
{
    //input_size should be a multiple of 4
    //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
    float cos_start=cos(starting_phase);
    float sin_start=sin(starting_phase);
    register float cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
        sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
        dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
        dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];

    for(int i=0;i<input_size/4; i++)
    {
        SADF_L1(0)
        SADF_L1(1)
        SADF_L1(2)
        SADF_L1(3)
        SADF_L2(0)
        SADF_L2(1)
        SADF_L2(2)
        SADF_L2(3)
        cos_start = cos_vals_3;
        sin_start = sin_vals_3;
    }
    starting_phase+=input_size*d->phase_increment;
    while(starting_phase>PI) starting_phase-=2*PI;
    while(starting_phase<-PI) starting_phase+=2*PI;
    return starting_phase;
}

#undef SADF_L2


#define SADF_L2(j) \
    tmp_inp_cos = iof(in_out,4*i+j); \
    tmp_inp_sin = qof(in_out,4*i+j); \
    iof(in_out,4*i+j)=(cos_vals_ ## j)*tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \
    qof(in_out,4*i+j)=(sin_vals_ ## j)*tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin;

PF_TARGET_CLONES
float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase)
{
    //N_cplx should be a multiple of 4
    //fprintf(stderr, "shift_addfast_inp_c: N_cplx = %d\n", N_cplx);
    float cos_start=cos(starting_phase);
    float sin_start=sin(starting_phase);
    register float tmp_inp_cos, tmp_inp_sin,
        cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
        sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
        dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
        dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];

    for(int i=0;i<N_cplx/4; i++)
    {
        SADF_L1(0)
        SADF_L1(1)
        SADF_L1(2)
        SADF_L1(3)
        SADF_L2(0)
        SADF_L2(1)
        SADF_L2(2)
        SADF_L2(3)
        cos_start = cos_vals_3;
        sin_start = sin_vals_3;
    }
    starting_phase+=N_cplx*d->phase_increment;
    while(starting_phase>PI) starting_phase-=2*PI;
    while(starting_phase<-PI) starting_phase+=2*PI;
    return starting_phase;
}

#undef SADF_L1
#undef SADF_L2
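
/* Usage sketch for the addfast variants (illustrative only; `in`, `out` and
 * the block length are caller-provided assumptions):
 *
 *   shift_addfast_data_t s = shift_addfast_init(0.125F);   // +0.125 * samplerate
 *   float phase = 0.0F;
 *   phase = shift_addfast_cc(in, out, 4096, &s, phase);    // out-of-place
 *   phase = shift_addfast_inp_c(out, 4096, &s, phase);     // in-place
 */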


/*********************************************************************/

/**************/
/*** ALGO D ***/
/**************/

shift_unroll_data_t shift_unroll_init(float rate, int size)
{
    shift_unroll_data_t output;
    output.phase_increment=2*rate*PI;
    output.size = size;
    output.dsin=(float*)malloc(sizeof(float)*size);
    output.dcos=(float*)malloc(sizeof(float)*size);
    float myphase = 0;
    for(int i=0;i<size;i++)
    {
        myphase += output.phase_increment;
        while(myphase>PI) myphase-=2*PI;
        while(myphase<-PI) myphase+=2*PI;
        output.dsin[i]=sin(myphase);
        output.dcos[i]=cos(myphase);
    }
    return output;
}

void shift_unroll_deinit(shift_unroll_data_t* d)
{
    if (!d)
        return;
    free(d->dsin);
    free(d->dcos);
    d->dsin = NULL;
    d->dcos = NULL;
}

PF_TARGET_CLONES
float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
{
    //input_size must not exceed the size given to shift_unroll_init()
    //fprintf(stderr, "shift_unroll_cc: input_size = %d\n", input_size);
    float cos_start = cos(starting_phase);
    float sin_start = sin(starting_phase);
    register float cos_val = cos_start, sin_val = sin_start;
    for(int i=0;i<input_size; i++)
    {
        iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i);
        qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i);
        // calculate complex phasor for next iteration
        cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
        sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
    }
    starting_phase+=input_size*d->phase_increment;
    while(starting_phase>PI) starting_phase-=2*PI;
    while(starting_phase<-PI) starting_phase+=2*PI;
    return starting_phase;
}

PF_TARGET_CLONES
float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase)
{
    float cos_start = cos(starting_phase);
    float sin_start = sin(starting_phase);
    register float cos_val = cos_start, sin_val = sin_start;
    for(int i=0;i<size; i++)
    {
        register float inp_i = iof(in_out,i);
        register float inp_q = qof(in_out,i);
        iof(in_out,i) = cos_val*inp_i - sin_val*inp_q;
        qof(in_out,i) = sin_val*inp_i + cos_val*inp_q;
        // calculate complex phasor for next iteration
        cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
        sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
    }
    starting_phase += size * d->phase_increment;
    while(starting_phase>PI) starting_phase-=2*PI;
    while(starting_phase<-PI) starting_phase+=2*PI;
    return starting_phase;
}
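
/* Usage sketch (illustrative only; `in`, `out` are caller-provided): the table
 * created by shift_unroll_init() must cover the largest block that will be
 * processed in one call.
 *
 *   shift_unroll_data_t s = shift_unroll_init(-0.25F, 4096);
 *   float phase = 0.0F;
 *   phase = shift_unroll_cc(in, out, 4096, &s, phase);
 *   phase = shift_unroll_inp_c(out, 4096, &s, phase);
 *   shift_unroll_deinit(&s);
 */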


/*********************************************************************/

/**************/
/*** ALGO E ***/
/**************/

shift_limited_unroll_data_t shift_limited_unroll_init(float rate)
{
    shift_limited_unroll_data_t output;
    output.phase_increment=2*rate*PI;
    float myphase = 0;
    for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++)
    {
        myphase += output.phase_increment;
        while(myphase>PI) myphase-=2*PI;
        while(myphase<-PI) myphase+=2*PI;
        output.dcos[i] = cos(myphase);
        output.dsin[i] = sin(myphase);
    }
    output.complex_phase.i = 1.0F;
    output.complex_phase.q = 0.0F;
    return output;
}

PF_TARGET_CLONES
void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d)
{
    float cos_start = d->complex_phase.i;
    float sin_start = d->complex_phase.q;
    register float cos_val = cos_start, sin_val = sin_start, mag;
    while (size > 0)
    {
        int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : size;
        for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
        {
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
            {
                iof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
                qof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
                // calculate complex phasor for next iteration
                cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
                sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            }
        }
        // "starts := vals := vals / |vals|"
        mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
        cos_val /= mag;
        sin_val /= mag;
        cos_start = cos_val;
        sin_start = sin_val;

        input += PF_SHIFT_LIMITED_UNROLL_SIZE;
        output += PF_SHIFT_LIMITED_UNROLL_SIZE;
        size -= PF_SHIFT_LIMITED_UNROLL_SIZE;
    }
    d->complex_phase.i = cos_val;
    d->complex_phase.q = sin_val;
}

PF_TARGET_CLONES
void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d)
{
    float inp_i[PF_SHIFT_LIMITED_SIMD_SZ];
    float inp_q[PF_SHIFT_LIMITED_SIMD_SZ];
    // "vals := starts := phase_state"
    float cos_start = d->complex_phase.i;
    float sin_start = d->complex_phase.q;
    register float cos_val = cos_start, sin_val = sin_start, mag;
    while (N_cplx)
    {
        int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
        {
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
                inp_i[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].i;
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
                inp_q[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].q;
            for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
            {
                // "out[] = inp[] * vals"
                iof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j];
                qof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j];
                // calculate complex phasor for next iteration
                // "vals :=  d[] * starts"
                cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
                sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            }
        }
        // "starts := vals := vals / |vals|"
        mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
        cos_val /= mag;
        sin_val /= mag;
        cos_start = cos_val;
        sin_start = sin_val;

        in_out += N;
        N_cplx -= N;  /* advance by the block actually processed, so a short last block cannot drive N_cplx negative and loop forever */
    }
    // "phase_state := starts"
    d->complex_phase.i = cos_start;
    d->complex_phase.q = sin_start;
}
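
/* Usage sketch (illustrative only; `in`, `out`, `n` are caller-provided): the
 * rate is fixed at init time and the phase lives inside the struct, so no
 * phase value is passed around. The phasor is re-normalized once per
 * PF_SHIFT_LIMITED_UNROLL_SIZE samples so its magnitude cannot drift.
 *
 *   shift_limited_unroll_data_t s = shift_limited_unroll_init(0.3F);
 *   shift_limited_unroll_cc(in, out, n, &s);   // out-of-place
 *   shift_limited_unroll_inp_c(out, n, &s);    // in-place
 */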


#ifdef HAVE_SSE_INTRINSICS

/*********************************************************************/

/**************/
/*** ALGO F ***/
/**************/

shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad)
{
    shift_limited_unroll_A_sse_data_t output;
    float myphase;

    output.phase_increment = 2*relative_freq*PI;

    myphase = 0.0F;
    for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
    {
        for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
        {
            myphase += output.phase_increment;
            while(myphase>PI) myphase-=2*PI;
            while(myphase<-PI) myphase+=2*PI;
        }
        output.dcos[i] = cos(myphase);
        output.dsin[i] = sin(myphase);
        for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
        {
            output.dcos[i+k] = output.dcos[i];
            output.dsin[i+k] = output.dsin[i];
        }
    }

    output.dcos_blk = 0.0F;
    output.dsin_blk = 0.0F;

    myphase = phase_start_rad;
    for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
    {
        output.phase_state_i[i] = cos(myphase);
        output.phase_state_q[i] = sin(myphase);
        myphase += output.phase_increment;
        while(myphase>PI) myphase-=2*PI;
        while(myphase<-PI) myphase+=2*PI;
    }
    return output;
}


PF_TARGET_CLONES
void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d)
{
    // "vals := starts := phase_state"
    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
    __m128 cos_vals = cos_starts;
    __m128 sin_vals = sin_starts;
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT p_trig_cos_tab;
    __m128 * RESTRICT p_trig_sin_tab;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        int B = NB;
        p_trig_cos_tab = (__m128*)( &d->dcos[0] );
        p_trig_sin_tab = (__m128*)( &d->dsin[0] );
        while (B)
        {
            // complex multiplication of 4 complex values from/to in_out[]
            // ==  u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
            // "out[] = inp[] * vals"
            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
            VSTORE(u, interl_prod_a);
            VSTORE(u+1, interl_prod_b);
            u += 2;
            // calculate complex phasor for next iteration
            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
            // "vals :=  d[] * starts"
            inp_re = VLOAD(p_trig_cos_tab);
            inp_im = VLOAD(p_trig_sin_tab);
            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
            ++p_trig_cos_tab;
            ++p_trig_sin_tab;
            B -= 4;
        }
        N_cplx -= NB;
        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
        /* re-use product_re[]/product_im[] for normalization */
        // "starts := vals := vals / |vals|"
        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
#if 0
        // more spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
        // even higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
        product_im = _mm_rsqrt_ps(product_re);
        cos_starts = cos_vals = VMUL(cos_vals, product_im);
        sin_starts = sin_vals = VMUL(sin_vals, product_im);
#else
        // spectrally comparable to shift_math_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
        // spectrally comparable to shift_math_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
        product_im = _mm_sqrt_ps(product_re);
        cos_starts = cos_vals = VDIV(cos_vals, product_im);
        sin_starts = sin_vals = VDIV(sin_vals, product_im);
#endif
    }
    // "phase_state := starts"
    VSTORE( &d->phase_state_i[0], cos_starts );
    VSTORE( &d->phase_state_q[0], sin_starts );
}
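
/* Note on the three SSE variants (documentation only): ALGO F, G and H differ
 * only in how the per-step trig increments are laid out in memory.
 *   A (F): separate dcos[] / dsin[] arrays, one vector load from each per step.
 *   B (G): one interleaved dtrig[] array {cos,sin,cos,sin}, split with
 *          UNINTERLEAVE2 after a single load.
 *   C (H): dinterl_trig[] stores a 4-float cos vector followed by a 4-float
 *          sin vector, read with two consecutive loads.
 * The complex multiply and the per-block re-normalization are identical.
 */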


/*********************************************************************/

/**************/
/*** ALGO G ***/
/**************/

shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad)
{
    shift_limited_unroll_B_sse_data_t output;
    float myphase;

    output.phase_increment = 2*relative_freq*PI;

    myphase = 0.0F;
    for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
    {
        for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
        {
            myphase += output.phase_increment;
            while(myphase>PI) myphase-=2*PI;
            while(myphase<-PI) myphase+=2*PI;
        }
        output.dtrig[i+0] = cos(myphase);
        output.dtrig[i+1] = sin(myphase);
        output.dtrig[i+2] = output.dtrig[i+0];
        output.dtrig[i+3] = output.dtrig[i+1];
    }

    output.dcos_blk = 0.0F;
    output.dsin_blk = 0.0F;

    myphase = phase_start_rad;
    for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
    {
        output.phase_state_i[i] = cos(myphase);
        output.phase_state_q[i] = sin(myphase);
        myphase += output.phase_increment;
        while(myphase>PI) myphase-=2*PI;
        while(myphase<-PI) myphase+=2*PI;
    }
    return output;
}


PF_TARGET_CLONES
void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d)
{
    // "vals := starts := phase_state"
    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
    __m128 cos_vals = cos_starts;
    __m128 sin_vals = sin_starts;
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT p_trig_tab;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        int B = NB;
        p_trig_tab = (__m128*)( &d->dtrig[0] );
        while (B)
        {
            // complex multiplication of 4 complex values from/to in_out[]
            // ==  u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
            // "out[] = inp[] * vals"
            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
            VSTORE(u, interl_prod_a);
            VSTORE(u+1, interl_prod_b);
            u += 2;
            // calculate complex phasor for next iteration
            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
            // "vals :=  d[] * starts"
            product_re = VLOAD(p_trig_tab);
            UNINTERLEAVE2(product_re, product_re, inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
            ++p_trig_tab;
            B -= 4;
        }
        N_cplx -= NB;
        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
        /* re-use product_re[]/product_im[] for normalization */
        // "starts := vals := vals / |vals|"
        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
#if 0
        // more spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
        // even higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
        product_im = _mm_rsqrt_ps(product_re);
        cos_starts = cos_vals = VMUL(cos_vals, product_im);
        sin_starts = sin_vals = VMUL(sin_vals, product_im);
#else
        // spectrally comparable to shift_math_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
        // spectrally comparable to shift_math_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
        product_im = _mm_sqrt_ps(product_re);
        cos_starts = cos_vals = VDIV(cos_vals, product_im);
        sin_starts = sin_vals = VDIV(sin_vals, product_im);
#endif
    }
    // "phase_state := starts"
    VSTORE( &d->phase_state_i[0], cos_starts );
    VSTORE( &d->phase_state_q[0], sin_starts );
}


/*********************************************************************/


/**************/
/*** ALGO H ***/
/**************/

shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad)
{
    shift_limited_unroll_C_sse_data_t output;
    float myphase;

    output.phase_increment = 2*relative_freq*PI;

    myphase = 0.0F;
    for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
    {
        for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
        {
            myphase += output.phase_increment;
            while(myphase>PI) myphase-=2*PI;
            while(myphase<-PI) myphase+=2*PI;
        }
        output.dinterl_trig[2*i] = cos(myphase);
        output.dinterl_trig[2*i+4] = sin(myphase);
        for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
        {
            output.dinterl_trig[2*i+k] = output.dinterl_trig[2*i];
            output.dinterl_trig[2*i+k+4] = output.dinterl_trig[2*i+4];
        }
    }

    output.dcos_blk = 0.0F;
    output.dsin_blk = 0.0F;

    myphase = phase_start_rad;
    for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
    {
        output.phase_state_i[i] = cos(myphase);
        output.phase_state_q[i] = sin(myphase);
        myphase += output.phase_increment;
        while(myphase>PI) myphase-=2*PI;
        while(myphase<-PI) myphase+=2*PI;
    }
    return output;
}


PF_TARGET_CLONES
void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d)
{
    // "vals := starts := phase_state"
    __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
    __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
    __m128 cos_vals = cos_starts;
    __m128 sin_vals = sin_starts;
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT p_trig_tab;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        int B = NB;
        p_trig_tab = (__m128*)( &d->dinterl_trig[0] );
        while (B)
        {
            // complex multiplication of 4 complex values from/to in_out[]
            // ==  u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
            // "out[] = inp[] * vals"
            UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
            VSTORE(u, interl_prod_a);
            VSTORE(u+1, interl_prod_b);
            u += 2;
            // calculate complex phasor for next iteration
            // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
            // "vals :=  d[] * starts"
            inp_re = VLOAD(p_trig_tab);
            inp_im = VLOAD(p_trig_tab+1);
            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
            p_trig_tab += 2;
            B -= 4;
        }
        N_cplx -= NB;
        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
        /* re-use product_re[]/product_im[] for normalization */
        // "starts := vals := vals / |vals|"
        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
#if 0
        // more spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
        // even higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
        product_im = _mm_rsqrt_ps(product_re);
        cos_starts = cos_vals = VMUL(cos_vals, product_im);
        sin_starts = sin_vals = VMUL(sin_vals, product_im);
#else
        // spectrally comparable to shift_math_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
        // spectrally comparable to shift_math_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
        product_im = _mm_sqrt_ps(product_re);
        cos_starts = cos_vals = VDIV(cos_vals, product_im);
        sin_starts = sin_vals = VDIV(sin_vals, product_im);
#endif
    }
    // "phase_state := starts"
    VSTORE( &d->phase_state_i[0], cos_starts );
    VSTORE( &d->phase_state_q[0], sin_starts );
}


#else

/*********************************************************************/

shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) {
    assert(0);
    shift_limited_unroll_A_sse_data_t r;
    return r;
}
shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) {
    assert(0);
    shift_limited_unroll_B_sse_data_t r;
    return r;
}
shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) {
    assert(0);
    shift_limited_unroll_C_sse_data_t r;
    return r;
}

void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) {
    assert(0);
}
void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) {
    assert(0);
}
void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) {
    assert(0);
}

#endif


/*********************************************************************/

/**************/
/*** ALGO I ***/
/**************/

void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state)
{
    // constants for single phase step
    float phase_increment_s = rate*PI;
    float k1 = tan(0.5*phase_increment_s);
    float k2 = 2*k1 /(1 + k1 * k1);
    for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SZ; j++)
    {
        float tmp;
        state->u_cos[j] = state->u_cos[j-1];
        state->v_sin[j] = state->v_sin[j-1];
        // small steps
        tmp = state->u_cos[j] - k1 * state->v_sin[j];
        state->v_sin[j] += k2 * tmp;
        state->u_cos[j] = tmp - k1 * state->v_sin[j];
    }

    // constants for PF_SHIFT_RECURSIVE_SIMD_SZ times phase step
    float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SZ;
    while(phase_increment_b > PI) phase_increment_b-=2*PI;
    while(phase_increment_b < -PI) phase_increment_b+=2*PI;
    conf->k1 = tan(0.5*phase_increment_b);
    conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
}
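
/* Why this recurrence works (documentation only): with k1 = tan(dphi/2) and
 * k2 = 2*k1/(1+k1*k1) = sin(dphi), the three-step update
 *   tmp = u - k1*v;   v += k2*tmp;   u = tmp - k1*v;
 * rotates the phasor (u, v) = (cos, sin) exactly by dphi, so the oscillator
 * magnitude stays at 1 without explicit re-normalization.
 */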

void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state)
{
    if (starting_phase != 0.0F)
    {
        state->u_cos[0] = cos(starting_phase);
        state->v_sin[0] = sin(starting_phase);
    }
    else
    {
        state->u_cos[0] = 1.0F;
        state->v_sin[0] = 0.0F;
    }
    shift_recursive_osc_update_rate(rate, conf, state);
}
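
/* Usage sketch (illustrative only; `buf`, `tone`, `n` are caller-provided):
 *
 *   shift_recursive_osc_conf_t conf;
 *   shift_recursive_osc_t osc;
 *   shift_recursive_osc_init(0.2F, 0.0F, &conf, &osc);
 *   shift_recursive_osc_inp_c(buf, n, &conf, &osc);         // mix in place
 *   gen_recursive_osc_c(tone, n, &conf, &osc);              // or just generate the oscillator
 *   shift_recursive_osc_update_rate(0.25F, &conf, &osc);    // retune, keeping the current phase
 */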


PF_TARGET_CLONES
void shift_recursive_osc_cc(const complexf *input, complexf* output,
    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
    shift_recursive_osc_t state = *state_ext;
    const float k1 = conf->k1;
    const float k2 = conf->k2;
    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_cc
    {
        //we multiply two complex numbers - similar to shift_math_cc
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            inp_i[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
            inp_q[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
        }
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
            qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
        }
        // update complex phasor - like incrementing phase
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.v_sin[j] += k2 * tmp[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
    }
    *state_ext = state;
}

PF_TARGET_CLONES
void shift_recursive_osc_inp_c(complexf* in_out,
    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
    float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
    shift_recursive_osc_t state = *state_ext;
    const float k1 = conf->k1;
    const float k2 = conf->k2;
    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
    {
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
            inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
        }
        //we multiply two complex numbers - similar to shift_math_cc
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
            qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
        }
        // update complex phasor - like incrementing phase
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.v_sin[j] += k2 * tmp[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
    }
    *state_ext = state;
}

PF_TARGET_CLONES
void gen_recursive_osc_c(complexf* output,
    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
{
    float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
    shift_recursive_osc_t state = *state_ext;
    const float k1 = conf->k1;
    const float k2 = conf->k2;
    for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@gen_recursive_osc_c
    {
        // output complex oscillator value
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
        {
            iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j];
            qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j];
        }
        // update complex phasor - like incrementing phase
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.v_sin[j] += k2 * tmp[j];
        for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
            state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
    }
    *state_ext = state;
}


#ifdef HAVE_SSE_INTRINSICS

/*********************************************************************/

/**************/
/*** ALGO J ***/
/**************/

void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
{
    // constants for single phase step
    float phase_increment_s = rate*PI;
    float k1 = tan(0.5*phase_increment_s);
    float k2 = 2*k1 /(1 + k1 * k1);
    for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++)
    {
        float tmp;
        state->u_cos[j] = state->u_cos[j-1];
        state->v_sin[j] = state->v_sin[j-1];
        // small steps
        tmp = state->u_cos[j] - k1 * state->v_sin[j];
        state->v_sin[j] += k2 * tmp;
        state->u_cos[j] = tmp - k1 * state->v_sin[j];
    }

    // constants for PF_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step
    float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ;
    while(phase_increment_b > PI) phase_increment_b-=2*PI;
    while(phase_increment_b < -PI) phase_increment_b+=2*PI;
    conf->k1 = tan(0.5*phase_increment_b);
    conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
}


void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
{
    if (starting_phase != 0.0F)
    {
        state->u_cos[0] = cos(starting_phase);
        state->v_sin[0] = sin(starting_phase);
    }
    else
    {
        state->u_cos[0] = 1.0F;
        state->v_sin[0] = 0.0F;
    }
    shift_recursive_osc_sse_update_rate(rate, conf, state);
}


PF_TARGET_CLONES
void shift_recursive_osc_sse_inp_c(complexf* in_out,
    int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
{
    const __m128 k1 = LD_PS1( conf->k1 );
    const __m128 k2 = LD_PS1( conf->k2 );
    __m128 u_cos = VLOAD( &state_ext->u_cos[0] );
    __m128 v_sin = VLOAD( &state_ext->v_sin[0] );
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        //inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i;
        //inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q;
        UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */

        //we multiply two complex numbers - similar to shift_math_cc
        //iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
        //qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
        product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) );
        product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) );
        INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
        VSTORE(u, interl_prod_a);
        VSTORE(u+1, interl_prod_b);
        u += 2;

        // update complex phasor - like incrementing phase
        // tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
        product_re = VSUB( u_cos, VMUL(k1, v_sin) );
        // state.v_sin[j] += k2 * tmp[j];
        v_sin = VADD( v_sin, VMUL(k2, product_re) );
        // state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
        u_cos = VSUB( product_re, VMUL(k1, v_sin) );

        N_cplx -= 4;
    }
    VSTORE( &state_ext->u_cos[0], u_cos );
    VSTORE( &state_ext->v_sin[0], v_sin );
}

#else

void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
{
    assert(0);
}

void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
{
    assert(0);
}


void shift_recursive_osc_sse_inp_c(complexf* in_out,
    int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
{
    assert(0);
}

#endif

