1 /*
2   Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )
3 
4   bench for mixer algorithm/implementations
5 
6  */
7 
8 #include <pf_mixer.h>
9 
10 #include <math.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <time.h>
14 #include <assert.h>
15 #include <string.h>
16 
17 #define HAVE_SYS_TIMES
18 
19 #ifdef HAVE_SYS_TIMES
20 #  include <sys/times.h>
21 #  include <unistd.h>
22 #endif
23 
/* Benchmark selection: each switch is tested with #if in main(); non-zero enables the group. */
/* reference implementation shift_math_cc() */
#define BENCH_REF_TRIG_FUNC       1
/* out-of-place variants */
#define BENCH_OUT_OF_PLACE_ALGOS  0
/* in-place variants */
#define BENCH_INPLACE_ALGOS       1

/* non-zero: save() falls back to "/dev/shm/bench.bin" when it is given no file name */
#define SAVE_BY_DEFAULT  0
/* save() writes at most SAVE_LIMIT_MSPS * 1024 * 1024 samples per file */
#define SAVE_LIMIT_MSPS           16
30 
/*
 * Result file names handed to save() by the individual benchmarks.
 * The disabled "#if 0" branch holds absolute paths for dumping the results;
 * the active branch uses empty names, for which save() writes nothing unless
 * SAVE_BY_DEFAULT is non-zero.
 */
#if 0
  #define BENCH_FILE_SHIFT_MATH_CC           "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
  #define BENCH_FILE_ADD_FAST_CC             "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
  #define BENCH_FILE_ADD_FAST_INP_C          "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
  #define BENCH_FILE_UNROLL_INP_C            "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_INP_C        "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
  #define BENCH_FILE_REC_OSC_CC              ""
  #define BENCH_FILE_REC_OSC_INP_C           "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
  #define BENCH_FILE_REC_OSC_SSE_INP_C       "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
#else
  #define BENCH_FILE_SHIFT_MATH_CC           ""
  #define BENCH_FILE_ADD_FAST_CC             ""
  #define BENCH_FILE_ADD_FAST_INP_C          ""
  #define BENCH_FILE_UNROLL_INP_C            ""
  #define BENCH_FILE_LTD_UNROLL_INP_C        ""
  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  ""
  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  ""
  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  ""
  #define BENCH_FILE_REC_OSC_CC              ""
  #define BENCH_FILE_REC_OSC_INP_C           ""
  #define BENCH_FILE_REC_OSC_SSE_INP_C       ""
#endif
56 
57 
58 
59 #if defined(HAVE_SYS_TIMES)
/* clock ticks per second from sysconf(_SC_CLK_TCK); stays 0 until first queried */
static double ttclk = 0.;

/*
 * Return the user CPU time consumed by this process so far, in seconds.
 *
 * find_start : when non-zero, keep polling times() until the user-time tick
 *              counter changes, so the returned time lies right after a tick
 *              change.
 *
 * Only user time is used - not realtime, which depends on the OS scheduler.
 */
static double uclock_sec(int find_start)
{
    struct tms now;

    if (ttclk == 0.)
    {
        /* query the tick rate lazily on the first call */
        ttclk = sysconf(_SC_CLK_TCK);
        fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
    }

    times(&now);
    if (find_start)
    {
        const clock_t first_tick = now.tms_utime;
        do {
            times(&now);
        } while (now.tms_utime == first_tick);
    }

    return ((double)now.tms_utime) / ttclk;
}
80 
81 #elif 0
82     // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
uclock_sec(int find_start)83     double uclock_sec(int find_start)
84     {
85         FILETIME a, b, c, d;
86         if (GetProcessTimes(GetCurrentProcess(), &a, &b, &c, &d) != 0)
87         {
88             //  Returns total user time.
89             //  Can be tweaked to include kernel times as well.
90             return
91                 (double)(d.dwLowDateTime |
92                     ((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001;
93         }
94         else {
95             //  Handle error
96             return 0;
97         }
98     }
99 
100 #else
/*
 * Portable fallback: return clock() converted to seconds.
 *
 * find_start : not used by this variant
 */
double uclock_sec(int find_start)
{
    const double ticks = (double)clock();

    (void)find_start;
    return ticks / (double)CLOCKS_PER_SEC;
}
103 #endif
104 
105 
/*
 * Write benchmark samples to a binary file.
 *
 * d  : buffer holding the samples
 * B  : block length in samples; only whole blocks of B samples are written
 * N  : number of valid samples in d; clamped to SAVE_LIMIT_MSPS * 1024 * 1024
 * fn : output file name; for NULL or "" the file "/dev/shm/bench.bin" is used
 *      when SAVE_BY_DEFAULT is non-zero, otherwise nothing is written
 *
 * Failures to open, write or close the file are reported on stderr.
 */
void save(complexf * d, int B, int N, const char * fn)
{
    if (!fn || !fn[0])
    {
        if (! SAVE_BY_DEFAULT)
            return;
        fn = "/dev/shm/bench.bin";
    }
    /* a non-positive block length would never advance the write loop below */
    if (!d || B <= 0)
        return;
    FILE * f = fopen(fn, "wb");
    if (!f) {
        fprintf(stderr, "error writing result to %s\n", fn);
        return;
    }
    if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 )
        N = SAVE_LIMIT_MSPS * 1024 * 1024;
    for (int off = 0; off + B <= N; off += B)
    {
        /* stop at the first short write instead of silently continuing */
        if (fwrite(d+off, sizeof(complexf), (size_t)B, f) != (size_t)B)
        {
            fprintf(stderr, "error writing result to %s\n", fn);
            break;
        }
    }
    /* fclose() flushes buffered data, so its result matters for written files */
    if (fclose(f) != 0)
        fprintf(stderr, "error writing result to %s\n", fn);
}
127 
128 
bench_shift_math_cc(int B,int N)129 double bench_shift_math_cc(int B, int N) {
130     double t0, t1, tstop, T, nI;
131     int iter, off;
132     float phase = 0.0F;
133     complexf *input = (complexf *)malloc(N * sizeof(complexf));
134     complexf *output = (complexf *)malloc(N * sizeof(complexf));
135     shift_recursive_osc_t gen_state;
136     shift_recursive_osc_conf_t gen_conf;
137 
138     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
139     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
140 
141     iter = 0;
142     off = 0;
143     t0 = uclock_sec(1);
144     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
145     do {
146         // work
147         phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase);
148         off += B;
149         ++iter;
150         t1 = uclock_sec(0);
151     } while ( t1 < tstop && off + B < N );
152 
153     save(output, B, off, BENCH_FILE_SHIFT_MATH_CC);
154 
155     free(input);
156     free(output);
157     T = ( t1 - t0 );  /* duration per fft() */
158     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
159     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
160     return (nI / T);    /* normalized iterations per second */
161 }
162 
163 
bench_shift_table_cc(int B,int N)164 double bench_shift_table_cc(int B, int N) {
165     double t0, t1, tstop, T, nI;
166     int iter, off;
167     int table_size=65536;
168     float phase = 0.0F;
169     complexf *input = (complexf *)malloc(N * sizeof(complexf));
170     complexf *output = (complexf *)malloc(N * sizeof(complexf));
171     shift_recursive_osc_t gen_state;
172     shift_recursive_osc_conf_t gen_conf;
173 
174     shift_table_data_t table_data = shift_table_init(table_size);
175 
176     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
177     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
178 
179     iter = 0;
180     off = 0;
181     t0 = uclock_sec(1);
182     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
183     do {
184         // work
185         phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase);
186 
187         off += B;
188         ++iter;
189         t1 = uclock_sec(0);
190     } while ( t1 < tstop && off + B < N );
191 
192     save(output, B, off, NULL);
193     free(input);
194     free(output);
195     T = ( t1 - t0 );  /* duration per fft() */
196     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
197     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
198     return (nI / T);    /* normalized iterations per second */
199 }
200 
201 
bench_shift_addfast(int B,int N)202 double bench_shift_addfast(int B, int N) {
203     double t0, t1, tstop, T, nI;
204     int iter, off;
205     float phase = 0.0F;
206     complexf *input = (complexf *)malloc(N * sizeof(complexf));
207     complexf *output = (complexf *)malloc(N * sizeof(complexf));
208     shift_recursive_osc_t gen_state;
209     shift_recursive_osc_conf_t gen_conf;
210     shift_addfast_data_t state = shift_addfast_init(-0.0009F);
211 
212     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
213     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
214 
215     iter = 0;
216     off = 0;
217     t0 = uclock_sec(1);
218     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
219     do {
220         // work
221         phase = shift_addfast_cc(input+off, output+off, B, &state, phase);
222 
223         off += B;
224         ++iter;
225         t1 = uclock_sec(0);
226     } while ( t1 < tstop && off + B < N );
227 
228     save(output, B, off, BENCH_FILE_ADD_FAST_CC);
229 
230     free(input);
231     free(output);
232     T = ( t1 - t0 );  /* duration per fft() */
233     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
234     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
235     return (nI / T);    /* normalized iterations per second */
236 }
237 
bench_shift_addfast_inp(int B,int N)238 double bench_shift_addfast_inp(int B, int N) {
239     double t0, t1, tstop, T, nI;
240     int iter, off;
241     float phase = 0.0F;
242     complexf *input = (complexf *)malloc(N * sizeof(complexf));
243     shift_recursive_osc_t gen_state;
244     shift_recursive_osc_conf_t gen_conf;
245     shift_addfast_data_t state = shift_addfast_init(-0.0009F);
246 
247     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
248     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
249 
250     iter = 0;
251     off = 0;
252     t0 = uclock_sec(1);
253     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
254     do {
255         // work
256         phase = shift_addfast_inp_c(input+off, B, &state, phase);
257 
258         off += B;
259         ++iter;
260         t1 = uclock_sec(0);
261     } while ( t1 < tstop && off + B < N );
262 
263     save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);
264 
265     free(input);
266     T = ( t1 - t0 );  /* duration per fft() */
267     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
268     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
269     return (nI / T);    /* normalized iterations per second */
270 }
271 
272 
bench_shift_unroll_oop(int B,int N)273 double bench_shift_unroll_oop(int B, int N) {
274     double t0, t1, tstop, T, nI;
275     int iter, off;
276     float phase = 0.0F;
277     complexf *input = (complexf *)malloc(N * sizeof(complexf));
278     complexf *output = (complexf *)malloc(N * sizeof(complexf));
279     shift_recursive_osc_t gen_state;
280     shift_recursive_osc_conf_t gen_conf;
281     shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
282 
283     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
284     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
285 
286     iter = 0;
287     off = 0;
288     t0 = uclock_sec(1);
289     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
290     do {
291         // work
292         phase = shift_unroll_cc(input+off, output+off, B, &state, phase);
293 
294         off += B;
295         ++iter;
296         t1 = uclock_sec(0);
297     } while ( t1 < tstop && off + B < N );
298 
299     save(output, B, off, NULL);
300     free(input);
301     free(output);
302     T = ( t1 - t0 );  /* duration per fft() */
303     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
304     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
305     return (nI / T);    /* normalized iterations per second */
306 }
307 
bench_shift_unroll_inp(int B,int N)308 double bench_shift_unroll_inp(int B, int N) {
309     double t0, t1, tstop, T, nI;
310     int iter, off;
311     float phase = 0.0F;
312     complexf *input = (complexf *)malloc(N * sizeof(complexf));
313     shift_recursive_osc_t gen_state;
314     shift_recursive_osc_conf_t gen_conf;
315     shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
316 
317     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
318     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
319 
320     iter = 0;
321     off = 0;
322     t0 = uclock_sec(1);
323     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
324     do {
325         // work
326         phase = shift_unroll_inp_c(input+off, B, &state, phase);
327 
328         off += B;
329         ++iter;
330         t1 = uclock_sec(0);
331     } while ( t1 < tstop && off + B < N );
332 
333     save(input, B, off, BENCH_FILE_UNROLL_INP_C);
334 
335     free(input);
336     T = ( t1 - t0 );  /* duration per fft() */
337     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
338     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
339     return (nI / T);    /* normalized iterations per second */
340 }
341 
342 
343 
bench_shift_limited_unroll_oop(int B,int N)344 double bench_shift_limited_unroll_oop(int B, int N) {
345     double t0, t1, tstop, T, nI;
346     int iter, off;
347     complexf *input = (complexf *)malloc(N * sizeof(complexf));
348     complexf *output = (complexf *)malloc(N * sizeof(complexf));
349     shift_recursive_osc_t gen_state;
350     shift_recursive_osc_conf_t gen_conf;
351     shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
352 
353     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
354     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
355 
356     iter = 0;
357     off = 0;
358     t0 = uclock_sec(1);
359     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
360     do {
361         // work
362         shift_limited_unroll_cc(input+off, output+off, B, &state);
363 
364         off += B;
365         ++iter;
366         t1 = uclock_sec(0);
367     } while ( t1 < tstop && off + B < N );
368 
369     save(output, B, off, NULL);
370     free(input);
371     free(output);
372     T = ( t1 - t0 );  /* duration per fft() */
373     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
374     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
375     return (nI / T);    /* normalized iterations per second */
376 }
377 
378 
bench_shift_limited_unroll_inp(int B,int N)379 double bench_shift_limited_unroll_inp(int B, int N) {
380     double t0, t1, tstop, T, nI;
381     int iter, off;
382     complexf *input = (complexf *)malloc(N * sizeof(complexf));
383     shift_recursive_osc_t gen_state;
384     shift_recursive_osc_conf_t gen_conf;
385     shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
386 
387     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
388     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
389 
390     iter = 0;
391     off = 0;
392     t0 = uclock_sec(1);
393     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
394     do {
395         // work
396         shift_limited_unroll_inp_c(input+off, B, &state);
397 
398         off += B;
399         ++iter;
400         t1 = uclock_sec(0);
401     } while ( t1 < tstop && off + B < N );
402 
403     save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);
404 
405     free(input);
406     T = ( t1 - t0 );  /* duration per fft() */
407     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
408     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
409     return (nI / T);    /* normalized iterations per second */
410 }
411 
412 
bench_shift_limited_unroll_A_sse_inp(int B,int N)413 double bench_shift_limited_unroll_A_sse_inp(int B, int N) {
414     double t0, t1, tstop, T, nI;
415     int iter, off;
416     complexf *input = (complexf *)malloc(N * sizeof(complexf));
417     shift_recursive_osc_t gen_state;
418     shift_recursive_osc_conf_t gen_conf;
419     shift_limited_unroll_A_sse_data_t *state = malloc(sizeof(shift_limited_unroll_A_sse_data_t));
420 
421     *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);
422 
423     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
424     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
425 
426     iter = 0;
427     off = 0;
428     t0 = uclock_sec(1);
429     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
430     do {
431         // work
432         shift_limited_unroll_A_sse_inp_c(input+off, B, state);
433 
434         off += B;
435         ++iter;
436         t1 = uclock_sec(0);
437     } while ( t1 < tstop && off + B < N );
438 
439     save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
440 
441     free(input);
442     T = ( t1 - t0 );  /* duration per fft() */
443     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
444     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
445     return (nI / T);    /* normalized iterations per second */
446 }
447 
bench_shift_limited_unroll_B_sse_inp(int B,int N)448 double bench_shift_limited_unroll_B_sse_inp(int B, int N) {
449     double t0, t1, tstop, T, nI;
450     int iter, off;
451     complexf *input = (complexf *)malloc(N * sizeof(complexf));
452     shift_recursive_osc_t gen_state;
453     shift_recursive_osc_conf_t gen_conf;
454     shift_limited_unroll_B_sse_data_t *state = malloc(sizeof(shift_limited_unroll_B_sse_data_t));
455 
456     *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);
457 
458     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
459     //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state);
460     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
461 
462     iter = 0;
463     off = 0;
464     t0 = uclock_sec(1);
465     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
466     do {
467         // work
468         shift_limited_unroll_B_sse_inp_c(input+off, B, state);
469 
470         off += B;
471         ++iter;
472         t1 = uclock_sec(0);
473     } while ( t1 < tstop && off + B < N );
474 
475     save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
476 
477     free(input);
478     T = ( t1 - t0 );  /* duration per fft() */
479     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
480     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
481     return (nI / T);    /* normalized iterations per second */
482 }
483 
bench_shift_limited_unroll_C_sse_inp(int B,int N)484 double bench_shift_limited_unroll_C_sse_inp(int B, int N) {
485     double t0, t1, tstop, T, nI;
486     int iter, off;
487     complexf *input = (complexf *)malloc(N * sizeof(complexf));
488     shift_recursive_osc_t gen_state;
489     shift_recursive_osc_conf_t gen_conf;
490     shift_limited_unroll_C_sse_data_t *state = malloc(sizeof(shift_limited_unroll_C_sse_data_t));
491 
492     *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);
493 
494     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
495     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
496 
497     iter = 0;
498     off = 0;
499     t0 = uclock_sec(1);
500     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
501     do {
502         // work
503         shift_limited_unroll_C_sse_inp_c(input+off, B, state);
504 
505         off += B;
506         ++iter;
507         t1 = uclock_sec(0);
508     } while ( t1 < tstop && off + B < N );
509 
510     save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
511 
512     free(input);
513     T = ( t1 - t0 );  /* duration per fft() */
514     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
515     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
516     return (nI / T);    /* normalized iterations per second */
517 }
518 
519 
bench_shift_rec_osc_cc_oop(int B,int N)520 double bench_shift_rec_osc_cc_oop(int B, int N) {
521     double t0, t1, tstop, T, nI;
522     int iter, off;
523     float phase = 0.0F;
524     complexf *input = (complexf *)malloc(N * sizeof(complexf));
525     complexf *output = (complexf *)malloc(N * sizeof(complexf));
526     shift_recursive_osc_t gen_state, shift_state;
527     shift_recursive_osc_conf_t gen_conf, shift_conf;
528 
529     shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
530     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
531     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
532 
533     iter = 0;
534     off = 0;
535     t0 = uclock_sec(1);
536     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
537     do {
538         // work
539         shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);
540 
541         off += B;
542         ++iter;
543         t1 = uclock_sec(0);
544     } while ( t1 < tstop && off + B < N );
545 
546     save(input, B, off, BENCH_FILE_REC_OSC_CC);
547 
548     save(output, B, off, NULL);
549     free(input);
550     free(output);
551     T = ( t1 - t0 );  /* duration per fft() */
552     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
553     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
554     return (nI / T);    /* normalized iterations per second */
555 }
556 
557 
bench_shift_rec_osc_cc_inp(int B,int N)558 double bench_shift_rec_osc_cc_inp(int B, int N) {
559     double t0, t1, tstop, T, nI;
560     int iter, off;
561     float phase = 0.0F;
562     complexf *input = (complexf *)malloc(N * sizeof(complexf));
563     shift_recursive_osc_t gen_state, shift_state;
564     shift_recursive_osc_conf_t gen_conf, shift_conf;
565 
566     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
567     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
568     shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
569 
570     iter = 0;
571     off = 0;
572     t0 = uclock_sec(1);
573     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
574     do {
575         // work
576         shift_recursive_osc_inp_c(input+off, B, &shift_conf, &shift_state);
577 
578         off += B;
579         ++iter;
580         t1 = uclock_sec(0);
581     } while ( t1 < tstop && off + B < N );
582 
583     save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
584     free(input);
585     T = ( t1 - t0 );  /* duration per fft() */
586     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
587     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
588     return (nI / T);    /* normalized iterations per second */
589 }
590 
591 
bench_shift_rec_osc_sse_c_inp(int B,int N)592 double bench_shift_rec_osc_sse_c_inp(int B, int N) {
593     double t0, t1, tstop, T, nI;
594     int iter, off;
595     float phase = 0.0F;
596     complexf *input = (complexf *)malloc(N * sizeof(complexf));
597     shift_recursive_osc_t gen_state;
598     shift_recursive_osc_conf_t gen_conf;
599 
600     shift_recursive_osc_sse_t *shift_state = malloc(sizeof(shift_recursive_osc_sse_t));
601     shift_recursive_osc_sse_conf_t shift_conf;
602 
603     shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
604     gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
605 
606     shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);
607 
608     iter = 0;
609     off = 0;
610     t0 = uclock_sec(1);
611     tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
612     do {
613         // work
614         shift_recursive_osc_sse_inp_c(input+off, B, &shift_conf, shift_state);
615 
616         off += B;
617         ++iter;
618         t1 = uclock_sec(0);
619     } while ( t1 < tstop && off + B < N );
620 
621     save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
622     free(input);
623     T = ( t1 - t0 );  /* duration per fft() */
624     printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
625     nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
626     return (nI / T);    /* normalized iterations per second */
627 }
628 
629 
630 
/*
 * Entry point: run the enabled mixer benchmarks and print their throughput.
 *
 * Usage: <program> [<blockLength in samples> [<total # of MSamples>] ]
 *
 * Without arguments the usage line is printed and the benchmarks run with the
 * defaults (blocks of 8 kSamples, 64 MSamples in total). Arguments that are
 * not positive decimal integers, a sample count that does not fit an int, or
 * a block length larger than the total make the program print the usage line
 * and stop.
 *
 * Returns 0 in all cases.
 */
int main(int argc, char **argv)
{
    /* largest MSample count whose sample count (x 1024 * 1024) still fits a 32-bit int */
    enum { MAX_MSAMPLES = 2047 };
    double rt;

    // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
    long blockLen = 8 * 1024;
    long numMSamples = 64;
    int argsValid = 1;
    int showUsage = 0;
    int B, N;

    if (argc == 1)
        showUsage = 1;

    if (1 < argc)
    {
        char *end;
        blockLen = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0')
            argsValid = 0;
    }
    if (2 < argc)
    {
        char *end;
        numMSamples = strtol(argv[2], &end, 10);
        if (end == argv[2] || *end != '\0')
            argsValid = 0;
    }

    /* range checks: the benchmarks index int-sized buffers and always process one whole block */
    if (numMSamples <= 0 || numMSamples > MAX_MSAMPLES)
        argsValid = 0;
    else if (blockLen <= 0 || blockLen > numMSamples * 1024 * 1024)
        argsValid = 0;

    if ( !argsValid || showUsage )
    {
        fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
        if ( !argsValid )
            return 0;
    }

    B = (int)blockLen;
    N = (int)(numMSamples * 1024 * 1024);

    fprintf(stderr, "processing up to N = %d MSamples with blocke length of %d samples\n",
        N / (1024 * 1024), B );


#if BENCH_REF_TRIG_FUNC
    printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
    rt = bench_shift_math_cc(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_OUT_OF_PLACE_ALGOS
    printf("starting bench of shift_table_cc (out-of-place) ..\n");
    rt = bench_shift_table_cc(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
    rt = bench_shift_addfast(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_limited_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
    rt = bench_shift_rec_osc_cc_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_INPLACE_ALGOS

    printf("starting bench of shift_addfast_inp_c in-place ..\n");
    rt = bench_shift_addfast_inp(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_unroll_inp_c in-place ..\n");
    rt = bench_shift_unroll_inp(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
    rt = bench_shift_limited_unroll_inp(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    /* the SSE variants run only when the library reports an SSE implementation */
    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_A_sse_inp(B, N);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_B_sse_inp(B, N);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_C_sse_inp(B, N);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }

    printf("starting bench of shift_recursive_osc_cc in-place ..\n");
    rt = bench_shift_rec_osc_cc_inp(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
        rt = bench_shift_rec_osc_sse_c_inp(B, N);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }
#endif

    return 0;
}
730 
731