1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 
15 #include <xnnpack/AlignedAllocator.h>
16 #include <xnnpack/common.h>
17 #include <xnnpack/vunary.h>
18 #include <xnnpack/params.h>
19 #include <xnnpack/params-init.h>
20 
21 
f32_vsqrt(benchmark::State & state,xnn_f32_vsqrt_ukernel_function vsqrt,benchmark::utils::IsaCheckFunction isa_check=nullptr)22 static void f32_vsqrt(
23   benchmark::State& state,
24   xnn_f32_vsqrt_ukernel_function vsqrt,
25   benchmark::utils::IsaCheckFunction isa_check = nullptr)
26 {
27   if (isa_check && !isa_check(state)) {
28     return;
29   }
30 
31   const size_t elements = state.range(0);
32   std::vector<float, AlignedAllocator<float, 64>> input(elements);
33   std::vector<float, AlignedAllocator<float, 64>> output(elements);
34 
35   std::random_device random_device;
36   auto rng = std::mt19937(random_device());
37   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 10.0f), std::ref(rng));
38   std::generate(input.begin(), input.end(), std::ref(f32rng));
39   std::fill(output.begin(), output.end(), std::nanf(""));
40 
41   const union xnn_f32_sqrt_params params = xnn_init_f32_sqrt_params();
42   for (auto _ : state) {
43     vsqrt(elements * sizeof(float), input.data(), output.data(), &params);
44   }
45 
46   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
47   if (cpu_frequency != 0) {
48     state.counters["cpufreq"] = cpu_frequency;
49   }
50 
51   const size_t elements_per_iteration = elements;
52   state.counters["elements"] =
53     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
54 
55   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
56   state.counters["bytes"] =
57     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
58 }
59 
#if XNN_ARCH_ARM64
  // neon_sqrt kernels: registered for 64-bit ARM builds only, with no ISA
  // check. Sizes come from Range(1000, 1000000) with RangeMultiplier(10).
  BENCHMARK_CAPTURE(f32_vsqrt, neon_sqrt_x4,
                    xnn_f32_vsqrt_ukernel__neon_sqrt_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neon_sqrt_x8,
                    xnn_f32_vsqrt_ukernel__neon_sqrt_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_ARM64
70 
// NEON FMA kernels. The guard covers both 32-bit and 64-bit ARM builds (it
// previously listed XNN_ARCH_ARM64 twice, which left 32-bit ARM builds without
// these benchmarks). Every registration passes benchmark::utils::CheckNEONFMA
// as the isa_check argument, so f32_vsqrt returns early when that check fails.
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // nr1rsqrts1fma1adj variants, x4 through x40.
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x4, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x8, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x12, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x12, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x16, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x16, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x20, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x20, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x24, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x28, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x32, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x36, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x40, xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();

  // nr2fma1adj variants, x4 through x40.
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x4, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x8, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x12, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x16, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x20, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x24, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x28, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x32, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x36, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x40, xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40, benchmark::utils::CheckNEONFMA)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
154 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // All x86 registrations use Range(1000, 1000000) with RangeMultiplier(10)
  // and UseRealTime().

  // SSE kernels: registered without an ISA check.
  BENCHMARK_CAPTURE(f32_vsqrt, sse_sqrt_x4,
                    xnn_f32_vsqrt_ukernel__sse_sqrt_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, sse_sqrt_x8,
                    xnn_f32_vsqrt_ukernel__sse_sqrt_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // AVX kernels: benchmark::utils::CheckAVX is passed as the isa_check.
  BENCHMARK_CAPTURE(f32_vsqrt, avx_sqrt_x8,
                    xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
                    benchmark::utils::CheckAVX)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx_sqrt_x16,
                    xnn_f32_vsqrt_ukernel__avx_sqrt_x16,
                    benchmark::utils::CheckAVX)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // FMA3 kernels: benchmark::utils::CheckFMA3 is passed as the isa_check.
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x8,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8,
                    benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x16,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16,
                    benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x24,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24,
                    benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x32,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32,
                    benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x40,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40,
                    benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x48,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48,
                    benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x56,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56,
                    benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x64,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64,
                    benchmark::utils::CheckFMA3)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // AVX512F kernels: benchmark::utils::CheckAVX512F is passed as the isa_check.
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x16,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16,
                    benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x32,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32,
                    benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x48,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48,
                    benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x64,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64,
                    benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x80,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80,
                    benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x96,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96,
                    benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x112,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112,
                    benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x128,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128,
                    benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
240 
#if XNN_ARCH_WASMSIMD
  // wasmsimd_sqrt kernels: registered without an ISA check.
  // Sizes come from Range(1000, 1000000) with RangeMultiplier(10).
  BENCHMARK_CAPTURE(f32_vsqrt, wasmsimd_sqrt_x4,
                    xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, wasmsimd_sqrt_x8,
                    xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD
251 
252 BENCHMARK_CAPTURE(f32_vsqrt, scalar_sqrt_x1, xnn_f32_vsqrt_ukernel__scalar_sqrt_x1)
253   ->RangeMultiplier(10)
254   ->Range(1000, 1000000)
255   ->UseRealTime();
256 BENCHMARK_CAPTURE(f32_vsqrt, scalar_sqrt_x2, xnn_f32_vsqrt_ukernel__scalar_sqrt_x2)
257   ->RangeMultiplier(10)
258   ->Range(1000, 1000000)
259   ->UseRealTime();
260 BENCHMARK_CAPTURE(f32_vsqrt, scalar_sqrt_x4, xnn_f32_vsqrt_ukernel__scalar_sqrt_x4)
261   ->RangeMultiplier(10)
262   ->Range(1000, 1000000)
263   ->UseRealTime();
264 
265 #ifndef XNNPACK_BENCHMARK_NO_MAIN
266 BENCHMARK_MAIN();
267 #endif
268