1 /*
2 Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
3
4 bench for mixer algorithm/implementations
5
6 */
7
#include <pf_mixer.h>

#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
16
/* When defined, uclock_sec() is built on times()/sysconf() and the headers
 * below are pulled in; otherwise the clock()-based fallback is compiled. */
#define HAVE_SYS_TIMES

#ifdef HAVE_SYS_TIMES
# include <sys/times.h>
# include <unistd.h>
#endif

/* Switches evaluated with #if in main() to select which benchmark groups run:
 * the shift_math_cc reference, the out-of-place algorithms, the in-place ones. */
#define BENCH_REF_TRIG_FUNC 1
#define BENCH_OUT_OF_PLACE_ALGOS 0
#define BENCH_INPLACE_ALGOS 1

/* SAVE_BY_DEFAULT: when non-zero, save() writes to "/dev/shm/bench.bin" if no
 * file name was given; when zero, such calls write nothing.
 * SAVE_LIMIT_MSPS: save() writes at most this many * 1024 * 1024 samples. */
#define SAVE_BY_DEFAULT 0
#define SAVE_LIMIT_MSPS 16
30
/* Per-benchmark output file names handed to save(). An empty string means
 * "no dedicated file": save() then writes only if SAVE_BY_DEFAULT is set.
 * The disabled (#if 0) branch keeps a set of absolute paths from a developer
 * machine; the active (#else) branch leaves every name empty. */
#if 0
#define BENCH_FILE_SHIFT_MATH_CC "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
#define BENCH_FILE_ADD_FAST_CC "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
#define BENCH_FILE_ADD_FAST_INP_C "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
#define BENCH_FILE_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
#define BENCH_FILE_LTD_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
#define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
#define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
#define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
#define BENCH_FILE_REC_OSC_CC ""
#define BENCH_FILE_REC_OSC_INP_C "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
#define BENCH_FILE_REC_OSC_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
#else
#define BENCH_FILE_SHIFT_MATH_CC ""
#define BENCH_FILE_ADD_FAST_CC ""
#define BENCH_FILE_ADD_FAST_INP_C ""
#define BENCH_FILE_UNROLL_INP_C ""
#define BENCH_FILE_LTD_UNROLL_INP_C ""
#define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C ""
#define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C ""
#define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C ""
#define BENCH_FILE_REC_OSC_CC ""
#define BENCH_FILE_REC_OSC_INP_C ""
#define BENCH_FILE_REC_OSC_SSE_INP_C ""
#endif
56
57
58
59 #if defined(HAVE_SYS_TIMES)
/* Clock ticks per second as reported by sysconf(); 0 until first queried. */
static double ttclk = 0.;

/*
 * Return the user CPU time consumed by this process, in seconds.
 *
 * find_start: when non-zero, spin until the user-time counter changes so the
 *             returned value sits right at the start of a fresh tick.
 *
 * The tick rate is fetched once, on the first call, and reported on stderr.
 */
static double uclock_sec(int find_start)
{
    struct tms now;

    if (ttclk == 0.)
    {
        ttclk = sysconf(_SC_CLK_TCK);
        fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
    }

    times(&now);
    if (find_start)
    {
        const clock_t first = now.tms_utime;
        /* busy-wait for the counter to advance */
        do {
            times(&now);
        } while (now.tms_utime == first);
    }

    /* user time only: wall-clock time would depend on the OS scheduler */
    return ((double)now.tms_utime) / ttclk;
}
80
81 #elif 0
82 // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
uclock_sec(int find_start)83 double uclock_sec(int find_start)
84 {
85 FILETIME a, b, c, d;
86 if (GetProcessTimes(GetCurrentProcess(), &a, &b, &c, &d) != 0)
87 {
88 // Returns total user time.
89 // Can be tweaked to include kernel times as well.
90 return
91 (double)(d.dwLowDateTime |
92 ((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001;
93 }
94 else {
95 // Handle error
96 return 0;
97 }
98 }
99
100 #else
/*
 * Fallback timer: processor time from clock(), converted to seconds.
 * find_start is accepted for interface compatibility and ignored.
 */
double uclock_sec(int find_start)
{
    const double ticks = (double)clock();

    (void)find_start;
    return ticks / (double)CLOCKS_PER_SEC;
}
103 #endif
104
105
/*
 * Dump benchmark samples to a binary file, one block of B samples at a time.
 *
 * d   samples to write
 * B   block length in samples; only whole blocks are written
 * N   number of valid samples in d; clamped to SAVE_LIMIT_MSPS * 1024 * 1024
 * fn  output file name; NULL or "" selects "/dev/shm/bench.bin" when
 *     SAVE_BY_DEFAULT is non-zero and otherwise makes this a no-op
 *
 * Failures are reported on stderr; nothing is returned to the caller.
 */
void save(complexf * d, int B, int N, const char * fn)
{
    FILE * f;
    int off;

    if (!fn || !fn[0])
    {
        if (! SAVE_BY_DEFAULT)
            return;
        fn = "/dev/shm/bench.bin";
    }
    /* B <= 0 would keep the write loop below from ever terminating */
    if (!d || B <= 0)
        return;

    f = fopen(fn, "wb");
    if (!f) {
        fprintf(stderr, "error writing result to %s\n", fn);
        return;
    }
    if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 )
        N = SAVE_LIMIT_MSPS * 1024 * 1024;
    /* "B <= N - off" is "off + B <= N" without the risk of int overflow */
    for (off = 0; B <= N - off; off += B)
    {
        if (fwrite(d+off, sizeof(complexf), (size_t)B, f) != (size_t)B) {
            fprintf(stderr, "short write while saving result to %s\n", fn);
            break;
        }
    }
    /* fclose() flushes buffered data, so its result matters for a written file */
    if (fclose(f) != 0)
        fprintf(stderr, "error closing %s\n", fn);
}
127
128
bench_shift_math_cc(int B,int N)129 double bench_shift_math_cc(int B, int N) {
130 double t0, t1, tstop, T, nI;
131 int iter, off;
132 float phase = 0.0F;
133 complexf *input = (complexf *)malloc(N * sizeof(complexf));
134 complexf *output = (complexf *)malloc(N * sizeof(complexf));
135 shift_recursive_osc_t gen_state;
136 shift_recursive_osc_conf_t gen_conf;
137
138 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
139 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
140
141 iter = 0;
142 off = 0;
143 t0 = uclock_sec(1);
144 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
145 do {
146 // work
147 phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase);
148 off += B;
149 ++iter;
150 t1 = uclock_sec(0);
151 } while ( t1 < tstop && off + B < N );
152
153 save(output, B, off, BENCH_FILE_SHIFT_MATH_CC);
154
155 free(input);
156 free(output);
157 T = ( t1 - t0 ); /* duration per fft() */
158 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
159 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
160 return (nI / T); /* normalized iterations per second */
161 }
162
163
bench_shift_table_cc(int B,int N)164 double bench_shift_table_cc(int B, int N) {
165 double t0, t1, tstop, T, nI;
166 int iter, off;
167 int table_size=65536;
168 float phase = 0.0F;
169 complexf *input = (complexf *)malloc(N * sizeof(complexf));
170 complexf *output = (complexf *)malloc(N * sizeof(complexf));
171 shift_recursive_osc_t gen_state;
172 shift_recursive_osc_conf_t gen_conf;
173
174 shift_table_data_t table_data = shift_table_init(table_size);
175
176 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
177 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
178
179 iter = 0;
180 off = 0;
181 t0 = uclock_sec(1);
182 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
183 do {
184 // work
185 phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase);
186
187 off += B;
188 ++iter;
189 t1 = uclock_sec(0);
190 } while ( t1 < tstop && off + B < N );
191
192 save(output, B, off, NULL);
193 free(input);
194 free(output);
195 T = ( t1 - t0 ); /* duration per fft() */
196 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
197 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
198 return (nI / T); /* normalized iterations per second */
199 }
200
201
bench_shift_addfast(int B,int N)202 double bench_shift_addfast(int B, int N) {
203 double t0, t1, tstop, T, nI;
204 int iter, off;
205 float phase = 0.0F;
206 complexf *input = (complexf *)malloc(N * sizeof(complexf));
207 complexf *output = (complexf *)malloc(N * sizeof(complexf));
208 shift_recursive_osc_t gen_state;
209 shift_recursive_osc_conf_t gen_conf;
210 shift_addfast_data_t state = shift_addfast_init(-0.0009F);
211
212 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
213 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
214
215 iter = 0;
216 off = 0;
217 t0 = uclock_sec(1);
218 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
219 do {
220 // work
221 phase = shift_addfast_cc(input+off, output+off, B, &state, phase);
222
223 off += B;
224 ++iter;
225 t1 = uclock_sec(0);
226 } while ( t1 < tstop && off + B < N );
227
228 save(output, B, off, BENCH_FILE_ADD_FAST_CC);
229
230 free(input);
231 free(output);
232 T = ( t1 - t0 ); /* duration per fft() */
233 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
234 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
235 return (nI / T); /* normalized iterations per second */
236 }
237
bench_shift_addfast_inp(int B,int N)238 double bench_shift_addfast_inp(int B, int N) {
239 double t0, t1, tstop, T, nI;
240 int iter, off;
241 float phase = 0.0F;
242 complexf *input = (complexf *)malloc(N * sizeof(complexf));
243 shift_recursive_osc_t gen_state;
244 shift_recursive_osc_conf_t gen_conf;
245 shift_addfast_data_t state = shift_addfast_init(-0.0009F);
246
247 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
248 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
249
250 iter = 0;
251 off = 0;
252 t0 = uclock_sec(1);
253 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
254 do {
255 // work
256 phase = shift_addfast_inp_c(input+off, B, &state, phase);
257
258 off += B;
259 ++iter;
260 t1 = uclock_sec(0);
261 } while ( t1 < tstop && off + B < N );
262
263 save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);
264
265 free(input);
266 T = ( t1 - t0 ); /* duration per fft() */
267 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
268 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
269 return (nI / T); /* normalized iterations per second */
270 }
271
272
bench_shift_unroll_oop(int B,int N)273 double bench_shift_unroll_oop(int B, int N) {
274 double t0, t1, tstop, T, nI;
275 int iter, off;
276 float phase = 0.0F;
277 complexf *input = (complexf *)malloc(N * sizeof(complexf));
278 complexf *output = (complexf *)malloc(N * sizeof(complexf));
279 shift_recursive_osc_t gen_state;
280 shift_recursive_osc_conf_t gen_conf;
281 shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
282
283 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
284 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
285
286 iter = 0;
287 off = 0;
288 t0 = uclock_sec(1);
289 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
290 do {
291 // work
292 phase = shift_unroll_cc(input+off, output+off, B, &state, phase);
293
294 off += B;
295 ++iter;
296 t1 = uclock_sec(0);
297 } while ( t1 < tstop && off + B < N );
298
299 save(output, B, off, NULL);
300 free(input);
301 free(output);
302 T = ( t1 - t0 ); /* duration per fft() */
303 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
304 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
305 return (nI / T); /* normalized iterations per second */
306 }
307
bench_shift_unroll_inp(int B,int N)308 double bench_shift_unroll_inp(int B, int N) {
309 double t0, t1, tstop, T, nI;
310 int iter, off;
311 float phase = 0.0F;
312 complexf *input = (complexf *)malloc(N * sizeof(complexf));
313 shift_recursive_osc_t gen_state;
314 shift_recursive_osc_conf_t gen_conf;
315 shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
316
317 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
318 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
319
320 iter = 0;
321 off = 0;
322 t0 = uclock_sec(1);
323 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
324 do {
325 // work
326 phase = shift_unroll_inp_c(input+off, B, &state, phase);
327
328 off += B;
329 ++iter;
330 t1 = uclock_sec(0);
331 } while ( t1 < tstop && off + B < N );
332
333 save(input, B, off, BENCH_FILE_UNROLL_INP_C);
334
335 free(input);
336 T = ( t1 - t0 ); /* duration per fft() */
337 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
338 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
339 return (nI / T); /* normalized iterations per second */
340 }
341
342
343
bench_shift_limited_unroll_oop(int B,int N)344 double bench_shift_limited_unroll_oop(int B, int N) {
345 double t0, t1, tstop, T, nI;
346 int iter, off;
347 complexf *input = (complexf *)malloc(N * sizeof(complexf));
348 complexf *output = (complexf *)malloc(N * sizeof(complexf));
349 shift_recursive_osc_t gen_state;
350 shift_recursive_osc_conf_t gen_conf;
351 shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
352
353 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
354 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
355
356 iter = 0;
357 off = 0;
358 t0 = uclock_sec(1);
359 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
360 do {
361 // work
362 shift_limited_unroll_cc(input+off, output+off, B, &state);
363
364 off += B;
365 ++iter;
366 t1 = uclock_sec(0);
367 } while ( t1 < tstop && off + B < N );
368
369 save(output, B, off, NULL);
370 free(input);
371 free(output);
372 T = ( t1 - t0 ); /* duration per fft() */
373 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
374 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
375 return (nI / T); /* normalized iterations per second */
376 }
377
378
bench_shift_limited_unroll_inp(int B,int N)379 double bench_shift_limited_unroll_inp(int B, int N) {
380 double t0, t1, tstop, T, nI;
381 int iter, off;
382 complexf *input = (complexf *)malloc(N * sizeof(complexf));
383 shift_recursive_osc_t gen_state;
384 shift_recursive_osc_conf_t gen_conf;
385 shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
386
387 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
388 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
389
390 iter = 0;
391 off = 0;
392 t0 = uclock_sec(1);
393 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
394 do {
395 // work
396 shift_limited_unroll_inp_c(input+off, B, &state);
397
398 off += B;
399 ++iter;
400 t1 = uclock_sec(0);
401 } while ( t1 < tstop && off + B < N );
402
403 save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);
404
405 free(input);
406 T = ( t1 - t0 ); /* duration per fft() */
407 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
408 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
409 return (nI / T); /* normalized iterations per second */
410 }
411
412
bench_shift_limited_unroll_A_sse_inp(int B,int N)413 double bench_shift_limited_unroll_A_sse_inp(int B, int N) {
414 double t0, t1, tstop, T, nI;
415 int iter, off;
416 complexf *input = (complexf *)malloc(N * sizeof(complexf));
417 shift_recursive_osc_t gen_state;
418 shift_recursive_osc_conf_t gen_conf;
419 shift_limited_unroll_A_sse_data_t *state = malloc(sizeof(shift_limited_unroll_A_sse_data_t));
420
421 *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);
422
423 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
424 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
425
426 iter = 0;
427 off = 0;
428 t0 = uclock_sec(1);
429 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
430 do {
431 // work
432 shift_limited_unroll_A_sse_inp_c(input+off, B, state);
433
434 off += B;
435 ++iter;
436 t1 = uclock_sec(0);
437 } while ( t1 < tstop && off + B < N );
438
439 save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
440
441 free(input);
442 T = ( t1 - t0 ); /* duration per fft() */
443 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
444 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
445 return (nI / T); /* normalized iterations per second */
446 }
447
bench_shift_limited_unroll_B_sse_inp(int B,int N)448 double bench_shift_limited_unroll_B_sse_inp(int B, int N) {
449 double t0, t1, tstop, T, nI;
450 int iter, off;
451 complexf *input = (complexf *)malloc(N * sizeof(complexf));
452 shift_recursive_osc_t gen_state;
453 shift_recursive_osc_conf_t gen_conf;
454 shift_limited_unroll_B_sse_data_t *state = malloc(sizeof(shift_limited_unroll_B_sse_data_t));
455
456 *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);
457
458 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
459 //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state);
460 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
461
462 iter = 0;
463 off = 0;
464 t0 = uclock_sec(1);
465 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
466 do {
467 // work
468 shift_limited_unroll_B_sse_inp_c(input+off, B, state);
469
470 off += B;
471 ++iter;
472 t1 = uclock_sec(0);
473 } while ( t1 < tstop && off + B < N );
474
475 save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
476
477 free(input);
478 T = ( t1 - t0 ); /* duration per fft() */
479 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
480 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
481 return (nI / T); /* normalized iterations per second */
482 }
483
bench_shift_limited_unroll_C_sse_inp(int B,int N)484 double bench_shift_limited_unroll_C_sse_inp(int B, int N) {
485 double t0, t1, tstop, T, nI;
486 int iter, off;
487 complexf *input = (complexf *)malloc(N * sizeof(complexf));
488 shift_recursive_osc_t gen_state;
489 shift_recursive_osc_conf_t gen_conf;
490 shift_limited_unroll_C_sse_data_t *state = malloc(sizeof(shift_limited_unroll_C_sse_data_t));
491
492 *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);
493
494 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
495 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
496
497 iter = 0;
498 off = 0;
499 t0 = uclock_sec(1);
500 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
501 do {
502 // work
503 shift_limited_unroll_C_sse_inp_c(input+off, B, state);
504
505 off += B;
506 ++iter;
507 t1 = uclock_sec(0);
508 } while ( t1 < tstop && off + B < N );
509
510 save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
511
512 free(input);
513 T = ( t1 - t0 ); /* duration per fft() */
514 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
515 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
516 return (nI / T); /* normalized iterations per second */
517 }
518
519
bench_shift_rec_osc_cc_oop(int B,int N)520 double bench_shift_rec_osc_cc_oop(int B, int N) {
521 double t0, t1, tstop, T, nI;
522 int iter, off;
523 float phase = 0.0F;
524 complexf *input = (complexf *)malloc(N * sizeof(complexf));
525 complexf *output = (complexf *)malloc(N * sizeof(complexf));
526 shift_recursive_osc_t gen_state, shift_state;
527 shift_recursive_osc_conf_t gen_conf, shift_conf;
528
529 shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
530 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
531 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
532
533 iter = 0;
534 off = 0;
535 t0 = uclock_sec(1);
536 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
537 do {
538 // work
539 shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);
540
541 off += B;
542 ++iter;
543 t1 = uclock_sec(0);
544 } while ( t1 < tstop && off + B < N );
545
546 save(input, B, off, BENCH_FILE_REC_OSC_CC);
547
548 save(output, B, off, NULL);
549 free(input);
550 free(output);
551 T = ( t1 - t0 ); /* duration per fft() */
552 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
553 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
554 return (nI / T); /* normalized iterations per second */
555 }
556
557
bench_shift_rec_osc_cc_inp(int B,int N)558 double bench_shift_rec_osc_cc_inp(int B, int N) {
559 double t0, t1, tstop, T, nI;
560 int iter, off;
561 float phase = 0.0F;
562 complexf *input = (complexf *)malloc(N * sizeof(complexf));
563 shift_recursive_osc_t gen_state, shift_state;
564 shift_recursive_osc_conf_t gen_conf, shift_conf;
565
566 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
567 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
568 shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
569
570 iter = 0;
571 off = 0;
572 t0 = uclock_sec(1);
573 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
574 do {
575 // work
576 shift_recursive_osc_inp_c(input+off, B, &shift_conf, &shift_state);
577
578 off += B;
579 ++iter;
580 t1 = uclock_sec(0);
581 } while ( t1 < tstop && off + B < N );
582
583 save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
584 free(input);
585 T = ( t1 - t0 ); /* duration per fft() */
586 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
587 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
588 return (nI / T); /* normalized iterations per second */
589 }
590
591
bench_shift_rec_osc_sse_c_inp(int B,int N)592 double bench_shift_rec_osc_sse_c_inp(int B, int N) {
593 double t0, t1, tstop, T, nI;
594 int iter, off;
595 float phase = 0.0F;
596 complexf *input = (complexf *)malloc(N * sizeof(complexf));
597 shift_recursive_osc_t gen_state;
598 shift_recursive_osc_conf_t gen_conf;
599
600 shift_recursive_osc_sse_t *shift_state = malloc(sizeof(shift_recursive_osc_sse_t));
601 shift_recursive_osc_sse_conf_t shift_conf;
602
603 shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
604 gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
605
606 shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);
607
608 iter = 0;
609 off = 0;
610 t0 = uclock_sec(1);
611 tstop = t0 + 0.5; /* benchmark duration: 500 ms */
612 do {
613 // work
614 shift_recursive_osc_sse_inp_c(input+off, B, &shift_conf, shift_state);
615
616 off += B;
617 ++iter;
618 t1 = uclock_sec(0);
619 } while ( t1 < tstop && off + B < N );
620
621 save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
622 free(input);
623 T = ( t1 - t0 ); /* duration per fft() */
624 printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
625 nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
626 return (nI / T); /* normalized iterations per second */
627 }
628
629
630
/*
 * Entry point of the mixer benchmark.
 *
 * Usage: <program> [<blockLength in samples> [<total # of MSamples>]]
 *
 * Without arguments the usage line is printed and the defaults are used.
 * Both arguments must be positive decimal integers, the sample count must fit
 * into an int after scaling by 1024 * 1024, and the block length must not
 * exceed the total sample count. Otherwise the usage line is printed and the
 * program exits without running anything (exit status 0, as before).
 *
 * The BENCH_* switches select which benchmark groups are compiled in; the SSE
 * variants additionally run only if have_sse_shift_mixer_impl() reports them.
 */
int main(int argc, char **argv)
{
    double rt;

    // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
    int B = 8 * 1024;
    int N = 64 * 1024 * 1024;
    int showUsage = (argc == 1);
    int badArgs = 0;
    char *end;
    long v;

    if (1 < argc)
    {
        v = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0' || v <= 0 || v > INT_MAX)
            badArgs = 1;
        else
            B = (int)v;
    }
    if (2 < argc)
    {
        v = strtol(argv[2], &end, 10);
        /* the upper bound keeps the scaled sample count within int */
        if (end == argv[2] || *end != '\0' || v <= 0 || v > INT_MAX / (1024 * 1024))
            badArgs = 1;
        else
            N = (int)v * 1024 * 1024;
    }
    /* each benchmark processes its first block before checking the buffer end */
    if (B > N)
        badArgs = 1;

    if ( badArgs || showUsage )
    {
        fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
        if ( badArgs )
            return 0;
    }

    fprintf(stderr, "processing up to N = %d MSamples with block length of %d samples\n",
        N / (1024 * 1024), B );


#if BENCH_REF_TRIG_FUNC
    printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
    rt = bench_shift_math_cc(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_OUT_OF_PLACE_ALGOS
    printf("starting bench of shift_table_cc (out-of-place) ..\n");
    rt = bench_shift_table_cc(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
    rt = bench_shift_addfast(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_unroll_oop(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_limited_unroll_oop(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
    rt = bench_shift_rec_osc_cc_oop(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_INPLACE_ALGOS

    printf("starting bench of shift_addfast_inp_c in-place ..\n");
    rt = bench_shift_addfast_inp(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_unroll_inp_c in-place ..\n");
    rt = bench_shift_unroll_inp(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
    rt = bench_shift_limited_unroll_inp(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_A_sse_inp(B, N);
        printf(" %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_B_sse_inp(B, N);
        printf(" %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_C_sse_inp(B, N);
        printf(" %f MSamples/sec\n\n", rt * 1E-6);
    }

    printf("starting bench of shift_recursive_osc_cc in-place ..\n");
    rt = bench_shift_rec_osc_cc_inp(B, N);
    printf(" %f MSamples/sec\n\n", rt * 1E-6);

    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
        rt = bench_shift_rec_osc_sse_c_inp(B, N);
        printf(" %f MSamples/sec\n\n", rt * 1E-6);
    }
#endif

    return 0;
}
730
731