1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 
15 #include <xnnpack/AlignedAllocator.h>
16 #include <xnnpack/common.h>
17 #include <xnnpack/hswish.h>
18 #include <xnnpack/params.h>
19 #include <xnnpack/params-init.h>
20 
21 
f32_hswish(benchmark::State & state,xnn_f32_hswish_ukernel_function hswish,benchmark::utils::IsaCheckFunction isa_check=nullptr)22 static void f32_hswish(
23   benchmark::State& state,
24   xnn_f32_hswish_ukernel_function hswish,
25   benchmark::utils::IsaCheckFunction isa_check = nullptr)
26 {
27   if (isa_check && !isa_check(state)) {
28     return;
29   }
30 
31   const size_t elements = state.range(0);
32   std::vector<float, AlignedAllocator<float, 64>> input(elements);
33   std::vector<float, AlignedAllocator<float, 64>> output(elements);
34 
35   std::random_device random_device;
36   auto rng = std::mt19937(random_device());
37   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
38   std::generate(input.begin(), input.end(), std::ref(f32rng));
39   std::fill(output.begin(), output.end(), std::nanf(""));
40 
41   const union xnn_f32_hswish_params params = xnn_init_f32_hswish_params();
42   for (auto _ : state) {
43     hswish(elements * sizeof(float), input.data(), output.data(), &params);
44   }
45 
46   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
47   if (cpu_frequency != 0) {
48     state.counters["cpufreq"] = cpu_frequency;
49   }
50 
51   const size_t elements_per_iteration = elements;
52   state.counters["elements"] =
53     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
54 
55   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
56   state.counters["bytes"] =
57     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
58 }
59 
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON microkernels: each is gated by the CheckNEON ISA check and runs on
  // sizes from 1000 to 1000000 in steps of 10x, timed with wall-clock time.
  BENCHMARK_CAPTURE(f32_hswish, neon_x4, xnn_f32_hswish_ukernel__neon_x4, benchmark::utils::CheckNEON)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, neon_x8, xnn_f32_hswish_ukernel__neon_x8, benchmark::utils::CheckNEON)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, neon_x16, xnn_f32_hswish_ukernel__neon_x16, benchmark::utils::CheckNEON)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
74 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // All x86 benchmarks run on sizes from 1000 to 1000000 in steps of 10x and
  // are timed with wall-clock time.

  // SSE variants are registered without an ISA check.
  BENCHMARK_CAPTURE(f32_hswish, sse_x4, xnn_f32_hswish_ukernel__sse_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, sse_x8, xnn_f32_hswish_ukernel__sse_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // AVX variants, gated by CheckAVX.
  BENCHMARK_CAPTURE(f32_hswish, avx_x8, xnn_f32_hswish_ukernel__avx_x8, benchmark::utils::CheckAVX)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, avx_x16, xnn_f32_hswish_ukernel__avx_x16, benchmark::utils::CheckAVX)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // FMA3 variants, gated by CheckFMA3.
  BENCHMARK_CAPTURE(f32_hswish, fma3_x8, xnn_f32_hswish_ukernel__fma3_x8, benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, fma3_x16, xnn_f32_hswish_ukernel__fma3_x16, benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // AVX512F variants, gated by CheckAVX512F.
  BENCHMARK_CAPTURE(f32_hswish, avx512f_x16, xnn_f32_hswish_ukernel__avx512f_x16, benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, avx512f_x32, xnn_f32_hswish_ukernel__avx512f_x32, benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
112 
#if XNN_ARCH_WASMSIMD
  // WAsm SIMD microkernels, registered without an ISA check; sizes run from
  // 1000 to 1000000 in steps of 10x, timed with wall-clock time.
  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x4, xnn_f32_hswish_ukernel__wasmsimd_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x8, xnn_f32_hswish_ukernel__wasmsimd_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x16, xnn_f32_hswish_ukernel__wasmsimd_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD
127 
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
  // WAsm microkernels, registered for both WAsm and WAsm SIMD builds without
  // an ISA check; sizes run from 1000 to 1000000 in steps of 10x, timed with
  // wall-clock time.
  BENCHMARK_CAPTURE(f32_hswish, wasm_x1, xnn_f32_hswish_ukernel__wasm_x1)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, wasm_x2, xnn_f32_hswish_ukernel__wasm_x2)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_hswish, wasm_x4, xnn_f32_hswish_ukernel__wasm_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
142 
143 BENCHMARK_CAPTURE(f32_hswish, scalar_x1, xnn_f32_hswish_ukernel__scalar_x1)
144   ->RangeMultiplier(10)
145   ->Range(1000, 1000000)
146   ->UseRealTime();
147 BENCHMARK_CAPTURE(f32_hswish, scalar_x2, xnn_f32_hswish_ukernel__scalar_x2)
148   ->RangeMultiplier(10)
149   ->Range(1000, 1000000)
150   ->UseRealTime();
151 BENCHMARK_CAPTURE(f32_hswish, scalar_x4, xnn_f32_hswish_ukernel__scalar_x4)
152   ->RangeMultiplier(10)
153   ->Range(1000, 1000000)
154   ->UseRealTime();
155 
156 #ifndef XNNPACK_BENCHMARK_NO_MAIN
157 BENCHMARK_MAIN();
158 #endif
159