1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 #include <xnnpack/AlignedAllocator.h>
15 #include <xnnpack/common.h>
16 #include <xnnpack/params.h>
17 #include <xnnpack/vunary.h>
18 
19 
f32_relu(benchmark::State & state,xnn_f32_relu_ukernel_function f32_relu,benchmark::utils::IsaCheckFunction isa_check=nullptr)20 static void f32_relu(
21   benchmark::State& state,
22   xnn_f32_relu_ukernel_function f32_relu,
23   benchmark::utils::IsaCheckFunction isa_check = nullptr)
24 {
25   if (isa_check && !isa_check(state)) {
26     return;
27   }
28 
29   const size_t elements = state.range(0);
30 
31   std::random_device random_device;
32   auto rng = std::mt19937(random_device());
33   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
34 
35   std::vector<float, AlignedAllocator<float, 64>> x(elements);
36   std::generate(x.begin(), x.end(), std::ref(f32rng));
37   std::vector<float, AlignedAllocator<float, 64>> y(elements);
38   std::generate(x.begin(), x.end(), std::ref(f32rng));
39 
40   for (auto _ : state) {
41     f32_relu(elements * sizeof(float), x.data(), y.data(), NULL);
42   }
43 
44   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
45   if (cpu_frequency != 0) {
46     state.counters["cpufreq"] = cpu_frequency;
47   }
48 
49   const size_t elements_per_iteration = elements;
50   state.counters["elements"] =
51     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
52 
53   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
54   state.counters["bytes"] =
55     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
56 }
57 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 / x86-64 kernels. Every registration sweeps state.range(0) from 1000
  // to 100000000 with a range multiplier of 10 and reports real time.

  // SSE kernels are registered without an ISA check.
  BENCHMARK_CAPTURE(f32_relu, sse_x4, xnn_f32_relu_ukernel__sse_x4)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, sse_x8, xnn_f32_relu_ukernel__sse_x8)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  // AVX kernels are gated on benchmark::utils::CheckAVX.
  BENCHMARK_CAPTURE(f32_relu, avx_x8, xnn_f32_relu_ukernel__avx_x8, benchmark::utils::CheckAVX)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, avx_x16, xnn_f32_relu_ukernel__avx_x16, benchmark::utils::CheckAVX)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  // AVX512F kernels are gated on benchmark::utils::CheckAVX512F.
  BENCHMARK_CAPTURE(f32_relu, avx512f_x16, xnn_f32_relu_ukernel__avx512f_x16, benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, avx512f_x32, xnn_f32_relu_ukernel__avx512f_x32, benchmark::utils::CheckAVX512F)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
89 
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // ARM / ARM64 NEON kernels, each gated on benchmark::utils::CheckNEON.
  // Both sweep state.range(0) from 1000 to 100000000 with a range multiplier
  // of 10 and report real time.
  BENCHMARK_CAPTURE(f32_relu, neon_x4, xnn_f32_relu_ukernel__neon_x4, benchmark::utils::CheckNEON)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, neon_x8, xnn_f32_relu_ukernel__neon_x8, benchmark::utils::CheckNEON)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
100 
101 
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
  // WebAssembly kernels registered for both WASM and WASM SIMD builds, all
  // without an ISA check. Every registration sweeps state.range(0) from 1000
  // to 100000000 with a range multiplier of 10 and reports real time.

  // "wasm" kernel family.
  BENCHMARK_CAPTURE(f32_relu, wasm_x1, xnn_f32_relu_ukernel__wasm_x1)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, wasm_x2, xnn_f32_relu_ukernel__wasm_x2)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, wasm_x4, xnn_f32_relu_ukernel__wasm_x4)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, wasm_x8, xnn_f32_relu_ukernel__wasm_x8)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  // "wasm32_shr" kernel family.
  BENCHMARK_CAPTURE(f32_relu, wasm32_shr_x1, xnn_f32_relu_ukernel__wasm32_shr_x1)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, wasm32_shr_x2, xnn_f32_relu_ukernel__wasm32_shr_x2)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, wasm32_shr_x4, xnn_f32_relu_ukernel__wasm32_shr_x4)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();
#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
138 
#if XNN_ARCH_WASMSIMD
  // WebAssembly SIMD kernels, registered without an ISA check. Every
  // registration sweeps state.range(0) from 1000 to 100000000 with a range
  // multiplier of 10 and reports real time.
  BENCHMARK_CAPTURE(f32_relu, wasmsimd_x4, xnn_f32_relu_ukernel__wasmsimd_x4)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, wasmsimd_x8, xnn_f32_relu_ukernel__wasmsimd_x8)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();

  BENCHMARK_CAPTURE(f32_relu, wasmsimd_x16, xnn_f32_relu_ukernel__wasmsimd_x16)
    ->RangeMultiplier(10)->Range(1000, 100000000)->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD
155 
156 BENCHMARK_CAPTURE(f32_relu, scalar_x1, xnn_f32_relu_ukernel__scalar_x1)
157   ->RangeMultiplier(10)
158   ->Range(1000, 100000000)
159   ->UseRealTime();
160 
161 BENCHMARK_CAPTURE(f32_relu, scalar_x2, xnn_f32_relu_ukernel__scalar_x2)
162   ->RangeMultiplier(10)
163   ->Range(1000, 100000000)
164   ->UseRealTime();
165 
166 BENCHMARK_CAPTURE(f32_relu, scalar_x4, xnn_f32_relu_ukernel__scalar_x4)
167   ->RangeMultiplier(10)
168   ->Range(1000, 100000000)
169   ->UseRealTime();
170 
171 BENCHMARK_CAPTURE(f32_relu, scalar_x8, xnn_f32_relu_ukernel__scalar_x8)
172   ->RangeMultiplier(10)
173   ->Range(1000, 100000000)
174   ->UseRealTime();
175 
176 #ifndef XNNPACK_BENCHMARK_NO_MAIN
177 BENCHMARK_MAIN();
178 #endif
179