1 /* 2 * Microbenchmark for math functions. 3 * 4 * Copyright (c) 2018, Arm Limited. 5 * SPDX-License-Identifier: MIT 6 */ 7 8 #undef _GNU_SOURCE 9 #define _GNU_SOURCE 1 10 #include <stdint.h> 11 #include <stdlib.h> 12 #include <stdio.h> 13 #include <string.h> 14 #include <time.h> 15 #include <math.h> 16 #include "mathlib.h" 17 18 #ifndef WANT_VMATH 19 /* Enable the build of vector math code. */ 20 # define WANT_VMATH 1 21 #endif 22 23 /* Number of measurements, best result is reported. */ 24 #define MEASURE 60 25 /* Array size. */ 26 #define N 8000 27 /* Iterations over the array. */ 28 #define ITER 125 29 30 static double *Trace; 31 static size_t trace_size; 32 static double A[N]; 33 static float Af[N]; 34 static long measurecount = MEASURE; 35 static long itercount = ITER; 36 37 #if __aarch64__ && WANT_VMATH 38 typedef __f64x2_t v_double; 39 40 #define v_double_len() 2 41 42 static inline v_double 43 v_double_load (const double *p) 44 { 45 return (v_double){p[0], p[1]}; 46 } 47 48 static inline v_double 49 v_double_dup (double x) 50 { 51 return (v_double){x, x}; 52 } 53 54 typedef __f32x4_t v_float; 55 56 #define v_float_len() 4 57 58 static inline v_float 59 v_float_load (const float *p) 60 { 61 return (v_float){p[0], p[1], p[2], p[3]}; 62 } 63 64 static inline v_float 65 v_float_dup (float x) 66 { 67 return (v_float){x, x, x, x}; 68 } 69 #else 70 /* dummy definitions to make things compile. */ 71 typedef double v_double; 72 typedef float v_float; 73 #define v_double_len(x) 1 74 #define v_double_load(x) (x)[0] 75 #define v_double_dup(x) (x) 76 #define v_float_len(x) 1 77 #define v_float_load(x) (x)[0] 78 #define v_float_dup(x) (x) 79 #endif 80 81 static double 82 dummy (double x) 83 { 84 return x; 85 } 86 87 static float 88 dummyf (float x) 89 { 90 return x; 91 } 92 93 #if WANT_VMATH 94 #if __aarch64__ 95 static v_double 96 __v_dummy (v_double x) 97 { 98 return x; 99 } 100 101 static v_float 102 __v_dummyf (v_float x) 103 { 104 return x; 105 } 106 107 #ifdef __vpcs 108 __vpcs static v_double 109 __vn_dummy (v_double x) 110 { 111 return x; 112 } 113 114 __vpcs static v_float 115 __vn_dummyf (v_float x) 116 { 117 return x; 118 } 119 120 __vpcs static v_float 121 xy__vn_powf (v_float x) 122 { 123 return __vn_powf (x, x); 124 } 125 126 __vpcs static v_float 127 xy_Z_powf (v_float x) 128 { 129 return _ZGVnN4vv_powf (x, x); 130 } 131 132 __vpcs static v_double 133 xy__vn_pow (v_double x) 134 { 135 return __vn_pow (x, x); 136 } 137 138 __vpcs static v_double 139 xy_Z_pow (v_double x) 140 { 141 return _ZGVnN2vv_pow (x, x); 142 } 143 #endif 144 145 static v_float 146 xy__v_powf (v_float x) 147 { 148 return __v_powf (x, x); 149 } 150 151 static v_double 152 xy__v_pow (v_double x) 153 { 154 return __v_pow (x, x); 155 } 156 #endif 157 158 static float 159 xy__s_powf (float x) 160 { 161 return __s_powf (x, x); 162 } 163 164 static double 165 xy__s_pow (double x) 166 { 167 return __s_pow (x, x); 168 } 169 #endif 170 171 static double 172 xypow (double x) 173 { 174 return pow (x, x); 175 } 176 177 static float 178 xypowf (float x) 179 { 180 return powf (x, x); 181 } 182 183 static double 184 xpow (double x) 185 { 186 return pow (x, 23.4); 187 } 188 189 static float 190 xpowf (float x) 191 { 192 return powf (x, 23.4f); 193 } 194 195 static double 196 ypow (double x) 197 { 198 return pow (2.34, x); 199 } 200 201 static float 202 ypowf (float x) 203 { 204 return powf (2.34f, x); 205 } 206 207 static float 208 sincosf_wrap (float x) 209 { 210 float s, c; 211 sincosf (x, &s, &c); 212 return s + c; 213 } 214 215 static const struct fun 216 { 217 const char *name; 218 int prec; 219 int vec; 220 double lo; 221 double hi; 222 union 223 { 224 double (*d) (double); 225 float (*f) (float); 226 v_double (*vd) (v_double); 227 v_float (*vf) (v_float); 228 #ifdef __vpcs 229 __vpcs v_double (*vnd) (v_double); 230 __vpcs v_float (*vnf) (v_float); 231 #endif 232 } fun; 233 } funtab[] = { 234 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, 235 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, 236 #define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, 237 #define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, 238 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, 239 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, 240 D (dummy, 1.0, 2.0) 241 D (exp, -9.9, 9.9) 242 D (exp, 0.5, 1.0) 243 D (exp2, -9.9, 9.9) 244 D (log, 0.01, 11.1) 245 D (log, 0.999, 1.001) 246 D (log2, 0.01, 11.1) 247 D (log2, 0.999, 1.001) 248 {"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, 249 D (xpow, 0.01, 11.1) 250 D (ypow, -9.9, 9.9) 251 252 F (dummyf, 1.0, 2.0) 253 F (expf, -9.9, 9.9) 254 F (exp2f, -9.9, 9.9) 255 F (logf, 0.01, 11.1) 256 F (log2f, 0.01, 11.1) 257 {"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, 258 F (xpowf, 0.01, 11.1) 259 F (ypowf, -9.9, 9.9) 260 {"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, 261 {"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, 262 {"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, 263 {"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, 264 {"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, 265 {"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, 266 F (sinf, 0.1, 0.7) 267 F (sinf, 0.8, 3.1) 268 F (sinf, -3.1, 3.1) 269 F (sinf, 3.3, 33.3) 270 F (sinf, 100, 1000) 271 F (sinf, 1e6, 1e32) 272 F (cosf, 0.1, 0.7) 273 F (cosf, 0.8, 3.1) 274 F (cosf, -3.1, 3.1) 275 F (cosf, 3.3, 33.3) 276 F (cosf, 100, 1000) 277 F (cosf, 1e6, 1e32) 278 #if WANT_VMATH 279 D (__s_sin, -3.1, 3.1) 280 D (__s_cos, -3.1, 3.1) 281 D (__s_exp, -9.9, 9.9) 282 D (__s_log, 0.01, 11.1) 283 {"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, 284 F (__s_expf, -9.9, 9.9) 285 F (__s_expf_1u, -9.9, 9.9) 286 F (__s_exp2f, -9.9, 9.9) 287 F (__s_exp2f_1u, -9.9, 9.9) 288 F (__s_logf, 0.01, 11.1) 289 {"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, 290 F (__s_sinf, -3.1, 3.1) 291 F (__s_cosf, -3.1, 3.1) 292 #if __aarch64__ 293 VD (__v_dummy, 1.0, 2.0) 294 VD (__v_sin, -3.1, 3.1) 295 VD (__v_cos, -3.1, 3.1) 296 VD (__v_exp, -9.9, 9.9) 297 VD (__v_log, 0.01, 11.1) 298 {"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, 299 VF (__v_dummyf, 1.0, 2.0) 300 VF (__v_expf, -9.9, 9.9) 301 VF (__v_expf_1u, -9.9, 9.9) 302 VF (__v_exp2f, -9.9, 9.9) 303 VF (__v_exp2f_1u, -9.9, 9.9) 304 VF (__v_logf, 0.01, 11.1) 305 {"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, 306 VF (__v_sinf, -3.1, 3.1) 307 VF (__v_cosf, -3.1, 3.1) 308 #ifdef __vpcs 309 VND (__vn_dummy, 1.0, 2.0) 310 VND (__vn_exp, -9.9, 9.9) 311 VND (_ZGVnN2v_exp, -9.9, 9.9) 312 VND (__vn_log, 0.01, 11.1) 313 VND (_ZGVnN2v_log, 0.01, 11.1) 314 {"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, 315 {"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, 316 VND (__vn_sin, -3.1, 3.1) 317 VND (_ZGVnN2v_sin, -3.1, 3.1) 318 VND (__vn_cos, -3.1, 3.1) 319 VND (_ZGVnN2v_cos, -3.1, 3.1) 320 VNF (__vn_dummyf, 1.0, 2.0) 321 VNF (__vn_expf, -9.9, 9.9) 322 VNF (_ZGVnN4v_expf, -9.9, 9.9) 323 VNF (__vn_expf_1u, -9.9, 9.9) 324 VNF (__vn_exp2f, -9.9, 9.9) 325 VNF (_ZGVnN4v_exp2f, -9.9, 9.9) 326 VNF (__vn_exp2f_1u, -9.9, 9.9) 327 VNF (__vn_logf, 0.01, 11.1) 328 VNF (_ZGVnN4v_logf, 0.01, 11.1) 329 {"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, 330 {"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, 331 VNF (__vn_sinf, -3.1, 3.1) 332 VNF (_ZGVnN4v_sinf, -3.1, 3.1) 333 VNF (__vn_cosf, -3.1, 3.1) 334 VNF (_ZGVnN4v_cosf, -3.1, 3.1) 335 #endif 336 #endif 337 #endif 338 {0}, 339 #undef F 340 #undef D 341 #undef VF 342 #undef VD 343 #undef VNF 344 #undef VND 345 }; 346 347 static void 348 gen_linear (double lo, double hi) 349 { 350 for (int i = 0; i < N; i++) 351 A[i] = (lo * (N - i) + hi * i) / N; 352 } 353 354 static void 355 genf_linear (double lo, double hi) 356 { 357 for (int i = 0; i < N; i++) 358 Af[i] = (float)(lo * (N - i) + hi * i) / N; 359 } 360 361 static inline double 362 asdouble (uint64_t i) 363 { 364 union 365 { 366 uint64_t i; 367 double f; 368 } u = {i}; 369 return u.f; 370 } 371 372 static uint64_t seed = 0x0123456789abcdef; 373 374 static double 375 frand (double lo, double hi) 376 { 377 seed = 6364136223846793005ULL * seed + 1; 378 return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0); 379 } 380 381 static void 382 gen_rand (double lo, double hi) 383 { 384 for (int i = 0; i < N; i++) 385 A[i] = frand (lo, hi); 386 } 387 388 static void 389 genf_rand (double lo, double hi) 390 { 391 for (int i = 0; i < N; i++) 392 Af[i] = (float)frand (lo, hi); 393 } 394 395 static void 396 gen_trace (int index) 397 { 398 for (int i = 0; i < N; i++) 399 A[i] = Trace[index + i]; 400 } 401 402 static void 403 genf_trace (int index) 404 { 405 for (int i = 0; i < N; i++) 406 Af[i] = (float)Trace[index + i]; 407 } 408 409 static void 410 run_thruput (double f (double)) 411 { 412 for (int i = 0; i < N; i++) 413 f (A[i]); 414 } 415 416 static void 417 runf_thruput (float f (float)) 418 { 419 for (int i = 0; i < N; i++) 420 f (Af[i]); 421 } 422 423 volatile double zero = 0; 424 425 static void 426 run_latency (double f (double)) 427 { 428 double z = zero; 429 double prev = z; 430 for (int i = 0; i < N; i++) 431 prev = f (A[i] + prev * z); 432 } 433 434 static void 435 runf_latency (float f (float)) 436 { 437 float z = (float)zero; 438 float prev = z; 439 for (int i = 0; i < N; i++) 440 prev = f (Af[i] + prev * z); 441 } 442 443 static void 444 run_v_thruput (v_double f (v_double)) 445 { 446 for (int i = 0; i < N; i += v_double_len ()) 447 f (v_double_load (A+i)); 448 } 449 450 static void 451 runf_v_thruput (v_float f (v_float)) 452 { 453 for (int i = 0; i < N; i += v_float_len ()) 454 f (v_float_load (Af+i)); 455 } 456 457 static void 458 run_v_latency (v_double f (v_double)) 459 { 460 v_double z = v_double_dup (zero); 461 v_double prev = z; 462 for (int i = 0; i < N; i += v_double_len ()) 463 prev = f (v_double_load (A+i) + prev * z); 464 } 465 466 static void 467 runf_v_latency (v_float f (v_float)) 468 { 469 v_float z = v_float_dup (zero); 470 v_float prev = z; 471 for (int i = 0; i < N; i += v_float_len ()) 472 prev = f (v_float_load (Af+i) + prev * z); 473 } 474 475 #ifdef __vpcs 476 static void 477 run_vn_thruput (__vpcs v_double f (v_double)) 478 { 479 for (int i = 0; i < N; i += v_double_len ()) 480 f (v_double_load (A+i)); 481 } 482 483 static void 484 runf_vn_thruput (__vpcs v_float f (v_float)) 485 { 486 for (int i = 0; i < N; i += v_float_len ()) 487 f (v_float_load (Af+i)); 488 } 489 490 static void 491 run_vn_latency (__vpcs v_double f (v_double)) 492 { 493 v_double z = v_double_dup (zero); 494 v_double prev = z; 495 for (int i = 0; i < N; i += v_double_len ()) 496 prev = f (v_double_load (A+i) + prev * z); 497 } 498 499 static void 500 runf_vn_latency (__vpcs v_float f (v_float)) 501 { 502 v_float z = v_float_dup (zero); 503 v_float prev = z; 504 for (int i = 0; i < N; i += v_float_len ()) 505 prev = f (v_float_load (Af+i) + prev * z); 506 } 507 #endif 508 509 static uint64_t 510 tic (void) 511 { 512 struct timespec ts; 513 if (clock_gettime (CLOCK_REALTIME, &ts)) 514 abort (); 515 return ts.tv_sec * 1000000000ULL + ts.tv_nsec; 516 } 517 518 #define TIMEIT(run, f) do { \ 519 dt = -1; \ 520 run (f); /* Warm up. */ \ 521 for (int j = 0; j < measurecount; j++) \ 522 { \ 523 uint64_t t0 = tic (); \ 524 for (int i = 0; i < itercount; i++) \ 525 run (f); \ 526 uint64_t t1 = tic (); \ 527 if (t1 - t0 < dt) \ 528 dt = t1 - t0; \ 529 } \ 530 } while (0) 531 532 static void 533 bench1 (const struct fun *f, int type, double lo, double hi) 534 { 535 uint64_t dt = 0; 536 uint64_t ns100; 537 const char *s = type == 't' ? "rthruput" : "latency"; 538 int vlen = 1; 539 540 if (f->vec && f->prec == 'd') 541 vlen = v_double_len(); 542 else if (f->vec && f->prec == 'f') 543 vlen = v_float_len(); 544 545 if (f->prec == 'd' && type == 't' && f->vec == 0) 546 TIMEIT (run_thruput, f->fun.d); 547 else if (f->prec == 'd' && type == 'l' && f->vec == 0) 548 TIMEIT (run_latency, f->fun.d); 549 else if (f->prec == 'f' && type == 't' && f->vec == 0) 550 TIMEIT (runf_thruput, f->fun.f); 551 else if (f->prec == 'f' && type == 'l' && f->vec == 0) 552 TIMEIT (runf_latency, f->fun.f); 553 else if (f->prec == 'd' && type == 't' && f->vec == 'v') 554 TIMEIT (run_v_thruput, f->fun.vd); 555 else if (f->prec == 'd' && type == 'l' && f->vec == 'v') 556 TIMEIT (run_v_latency, f->fun.vd); 557 else if (f->prec == 'f' && type == 't' && f->vec == 'v') 558 TIMEIT (runf_v_thruput, f->fun.vf); 559 else if (f->prec == 'f' && type == 'l' && f->vec == 'v') 560 TIMEIT (runf_v_latency, f->fun.vf); 561 #ifdef __vpcs 562 else if (f->prec == 'd' && type == 't' && f->vec == 'n') 563 TIMEIT (run_vn_thruput, f->fun.vnd); 564 else if (f->prec == 'd' && type == 'l' && f->vec == 'n') 565 TIMEIT (run_vn_latency, f->fun.vnd); 566 else if (f->prec == 'f' && type == 't' && f->vec == 'n') 567 TIMEIT (runf_vn_thruput, f->fun.vnf); 568 else if (f->prec == 'f' && type == 'l' && f->vec == 'n') 569 TIMEIT (runf_vn_latency, f->fun.vnf); 570 #endif 571 572 if (type == 't') 573 { 574 ns100 = (100 * dt + itercount * N / 2) / (itercount * N); 575 printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, 576 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 577 (unsigned long long) dt, lo, hi); 578 } 579 else if (type == 'l') 580 { 581 ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); 582 printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, 583 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 584 (unsigned long long) dt, lo, hi); 585 } 586 fflush (stdout); 587 } 588 589 static void 590 bench (const struct fun *f, double lo, double hi, int type, int gen) 591 { 592 if (f->prec == 'd' && gen == 'r') 593 gen_rand (lo, hi); 594 else if (f->prec == 'd' && gen == 'l') 595 gen_linear (lo, hi); 596 else if (f->prec == 'd' && gen == 't') 597 gen_trace (0); 598 else if (f->prec == 'f' && gen == 'r') 599 genf_rand (lo, hi); 600 else if (f->prec == 'f' && gen == 'l') 601 genf_linear (lo, hi); 602 else if (f->prec == 'f' && gen == 't') 603 genf_trace (0); 604 605 if (gen == 't') 606 hi = trace_size / N; 607 608 if (type == 'b' || type == 't') 609 bench1 (f, 't', lo, hi); 610 611 if (type == 'b' || type == 'l') 612 bench1 (f, 'l', lo, hi); 613 614 for (int i = N; i < trace_size; i += N) 615 { 616 if (f->prec == 'd') 617 gen_trace (i); 618 else 619 genf_trace (i); 620 621 lo = i / N; 622 if (type == 'b' || type == 't') 623 bench1 (f, 't', lo, hi); 624 625 if (type == 'b' || type == 'l') 626 bench1 (f, 'l', lo, hi); 627 } 628 } 629 630 static void 631 readtrace (const char *name) 632 { 633 int n = 0; 634 FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r"); 635 if (!f) 636 { 637 printf ("openning \"%s\" failed: %m\n", name); 638 exit (1); 639 } 640 for (;;) 641 { 642 if (n >= trace_size) 643 { 644 trace_size += N; 645 Trace = realloc (Trace, trace_size * sizeof (Trace[0])); 646 if (Trace == NULL) 647 { 648 printf ("out of memory\n"); 649 exit (1); 650 } 651 } 652 if (fscanf (f, "%lf", Trace + n) != 1) 653 break; 654 n++; 655 } 656 if (ferror (f) || n == 0) 657 { 658 printf ("reading \"%s\" failed: %m\n", name); 659 exit (1); 660 } 661 fclose (f); 662 if (n % N == 0) 663 trace_size = n; 664 for (int i = 0; n < trace_size; n++, i++) 665 Trace[n] = Trace[i]; 666 } 667 668 static void 669 usage (void) 670 { 671 printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] " 672 "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func " 673 "[func2 ..]\n"); 674 printf ("func:\n"); 675 printf ("%7s [run all benchmarks]\n", "all"); 676 for (const struct fun *f = funtab; f->name; f++) 677 printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi); 678 exit (1); 679 } 680 681 int 682 main (int argc, char *argv[]) 683 { 684 int usergen = 0, gen = 'r', type = 'b', all = 0; 685 double lo = 0, hi = 0; 686 const char *tracefile = "-"; 687 688 argv++; 689 argc--; 690 for (;;) 691 { 692 if (argc <= 0) 693 usage (); 694 if (argv[0][0] != '-') 695 break; 696 else if (argc >= 3 && strcmp (argv[0], "-i") == 0) 697 { 698 usergen = 1; 699 lo = strtod (argv[1], 0); 700 hi = strtod (argv[2], 0); 701 argv += 3; 702 argc -= 3; 703 } 704 else if (argc >= 2 && strcmp (argv[0], "-m") == 0) 705 { 706 measurecount = strtol (argv[1], 0, 0); 707 argv += 2; 708 argc -= 2; 709 } 710 else if (argc >= 2 && strcmp (argv[0], "-c") == 0) 711 { 712 itercount = strtol (argv[1], 0, 0); 713 argv += 2; 714 argc -= 2; 715 } 716 else if (argc >= 2 && strcmp (argv[0], "-g") == 0) 717 { 718 gen = argv[1][0]; 719 if (strchr ("rlt", gen) == 0) 720 usage (); 721 argv += 2; 722 argc -= 2; 723 } 724 else if (argc >= 2 && strcmp (argv[0], "-f") == 0) 725 { 726 gen = 't'; /* -f implies -g trace. */ 727 tracefile = argv[1]; 728 argv += 2; 729 argc -= 2; 730 } 731 else if (argc >= 2 && strcmp (argv[0], "-t") == 0) 732 { 733 type = argv[1][0]; 734 if (strchr ("ltb", type) == 0) 735 usage (); 736 argv += 2; 737 argc -= 2; 738 } 739 else 740 usage (); 741 } 742 if (gen == 't') 743 { 744 readtrace (tracefile); 745 lo = hi = 0; 746 usergen = 1; 747 } 748 while (argc > 0) 749 { 750 int found = 0; 751 all = strcmp (argv[0], "all") == 0; 752 for (const struct fun *f = funtab; f->name; f++) 753 if (all || strcmp (argv[0], f->name) == 0) 754 { 755 found = 1; 756 if (!usergen) 757 { 758 lo = f->lo; 759 hi = f->hi; 760 } 761 bench (f, lo, hi, type, gen); 762 if (usergen && !all) 763 break; 764 } 765 if (!found) 766 printf ("unknown function: %s\n", argv[0]); 767 argv++; 768 argc--; 769 } 770 return 0; 771 } 772