1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <xnnpack.h>
13
14 #include <benchmark/benchmark.h>
15
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 #include <xnnpack/dwconv.h>
20 #include <xnnpack/params.h>
21
22
DWConvEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_f32_dwconv_minmax_unipass_ukernel_function dwconv,uint8_t channel_tile,uint8_t primary_tile,benchmark::utils::IsaCheckFunction isa_check=nullptr)23 static void DWConvEnd2EndBenchmark(
24 benchmark::State& state,
25 models::ExecutionPlanFactory model_factory,
26 xnn_f32_dwconv_minmax_unipass_ukernel_function dwconv,
27 uint8_t channel_tile, uint8_t primary_tile,
28 benchmark::utils::IsaCheckFunction isa_check = nullptr)
29 {
30 if (isa_check && !isa_check(state)) {
31 return;
32 }
33 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
34 state.SkipWithError("failed to initialize XNNPACK");
35 return;
36 }
37
38 // Override microkernels chosen in xnn_initialize
39 for (size_t i = 0; i < XNN_MAX_F32_DWCONV_UKERNELS; i++) {
40 // Replace only the microkernel the matching kernel size.
41 if (xnn_params.f32.dwconv[i].primary_tile == primary_tile) {
42 // Note: do not directly assign to xnn_params.f32.dwconv[i] because it breaks older gcc.
43 xnn_params.f32.dwconv[i].minmax.unipass = xnn_dwconv_unipass_ukernel_function(dwconv);
44 xnn_params.f32.dwconv[i].channel_tile = channel_tile;
45 xnn_params.f32.dwconv[i].primary_tile = primary_tile;
46 xnn_params.f32.dwconv[i].incremental_tile = 0;
47 break;
48 }
49 }
50
51 auto execution_plan = model_factory(nullptr);
52 if (execution_plan.empty()) {
53 state.SkipWithError("failed to create a model");
54 return;
55 }
56
57 for (auto _ : state) {
58 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
59 xnn_status status = xnn_run_operator(op.get(), nullptr);
60 if (status != xnn_status_success) {
61 state.SkipWithError("failed to run a model");
62 return;
63 }
64 }
65 }
66
67 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
68 if (cpu_frequency != 0) {
69 state.counters["cpufreq"] = cpu_frequency;
70 }
71 }
72
73 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
f32_dwconv_up4x9__aarch64_neonfma(benchmark::State & state,models::ExecutionPlanFactory model)74 static void f32_dwconv_up4x9__aarch64_neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
75 DWConvEnd2EndBenchmark(state, model,
76 xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma,
77 4 /* cr */, 9 /* mr */);
78 }
79
f32_dwconv_up4x9__aarch64_neonfma_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)80 static void f32_dwconv_up4x9__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
81 DWConvEnd2EndBenchmark(state, model,
82 xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55,
83 4 /* cr */, 9 /* mr */);
84 }
85
// Register the AArch64 assembly DWCONV benchmarks.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this file
// (presumably in bench/end2end.h) — confirm which models it instantiates.
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__aarch64_neonfma);
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__aarch64_neonfma_cortex_a55);
88 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
89
90 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
f32_dwconv_up4x9__neon(benchmark::State & state,models::ExecutionPlanFactory model)91 static void f32_dwconv_up4x9__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
92 DWConvEnd2EndBenchmark(state, model,
93 xnn_f32_dwconv_minmax_ukernel_up4x9__neon,
94 4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
95 }
96
f32_dwconv_up4x9__neon_acc2(benchmark::State & state,models::ExecutionPlanFactory model)97 static void f32_dwconv_up4x9__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
98 DWConvEnd2EndBenchmark(state, model,
99 xnn_f32_dwconv_minmax_ukernel_up4x9__neon_acc2,
100 4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
101 }
102
f32_dwconv_up8x9__neon(benchmark::State & state,models::ExecutionPlanFactory model)103 static void f32_dwconv_up8x9__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
104 DWConvEnd2EndBenchmark(state, model,
105 xnn_f32_dwconv_minmax_ukernel_up8x9__neon,
106 8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
107 }
108
f32_dwconv_up8x9__neon_acc2(benchmark::State & state,models::ExecutionPlanFactory model)109 static void f32_dwconv_up8x9__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
110 DWConvEnd2EndBenchmark(state, model,
111 xnn_f32_dwconv_minmax_ukernel_up8x9__neon_acc2,
112 8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
113 }
114
f32_dwconv_up4x9__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)115 static void f32_dwconv_up4x9__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
116 DWConvEnd2EndBenchmark(state, model,
117 xnn_f32_dwconv_minmax_ukernel_up4x9__neonfma,
118 4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
119 }
120
f32_dwconv_up4x9__neonfma_acc2(benchmark::State & state,models::ExecutionPlanFactory model)121 static void f32_dwconv_up4x9__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
122 DWConvEnd2EndBenchmark(state, model,
123 xnn_f32_dwconv_minmax_ukernel_up4x9__neonfma_acc2,
124 4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
125 }
126
f32_dwconv_up8x9__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)127 static void f32_dwconv_up8x9__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
128 DWConvEnd2EndBenchmark(state, model,
129 xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma,
130 8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
131 }
132
f32_dwconv_up8x9__neonfma_acc2(benchmark::State & state,models::ExecutionPlanFactory model)133 static void f32_dwconv_up8x9__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
134 DWConvEnd2EndBenchmark(state, model,
135 xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma_acc2,
136 8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
137 }
138
// Register the NEON DWCONV benchmarks.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this file
// (presumably in bench/end2end.h) — confirm which models it instantiates.
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__neon);
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__neon_acc2);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__neon);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__neon_acc2);

// Register the NEON FMA DWCONV benchmarks.
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__neonfma);
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__neonfma_acc2);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__neonfma);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__neonfma_acc2);
148 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
149
150
151 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
f32_dwconv_up4x9__sse(benchmark::State & state,models::ExecutionPlanFactory model)152 static void f32_dwconv_up4x9__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
153 DWConvEnd2EndBenchmark(state, model,
154 xnn_f32_dwconv_minmax_ukernel_up4x9__sse,
155 4 /* cr */, 9 /* mr */);
156 }
f32_dwconv_up4x9__sse_acc2(benchmark::State & state,models::ExecutionPlanFactory model)157 static void f32_dwconv_up4x9__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
158 DWConvEnd2EndBenchmark(state, model,
159 xnn_f32_dwconv_minmax_ukernel_up4x9__sse_acc2,
160 4 /* cr */, 9 /* mr */);
161 }
f32_dwconv_up8x9__sse(benchmark::State & state,models::ExecutionPlanFactory model)162 static void f32_dwconv_up8x9__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
163 DWConvEnd2EndBenchmark(state, model,
164 xnn_f32_dwconv_minmax_ukernel_up8x9__sse,
165 8 /* cr */, 9 /* mr */);
166 }
f32_dwconv_up8x9__sse_acc2(benchmark::State & state,models::ExecutionPlanFactory model)167 static void f32_dwconv_up8x9__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
168 DWConvEnd2EndBenchmark(state, model,
169 xnn_f32_dwconv_minmax_ukernel_up8x9__sse_acc2,
170 8 /* cr */, 9 /* mr */);
171 }
172
f32_dwconv_up8x9__avx(benchmark::State & state,models::ExecutionPlanFactory model)173 static void f32_dwconv_up8x9__avx(benchmark::State& state, models::ExecutionPlanFactory model) {
174 DWConvEnd2EndBenchmark(state, model,
175 xnn_f32_dwconv_minmax_ukernel_up8x9__avx,
176 8 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
177 }
f32_dwconv_up8x9__avx_acc2(benchmark::State & state,models::ExecutionPlanFactory model)178 static void f32_dwconv_up8x9__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
179 DWConvEnd2EndBenchmark(state, model,
180 xnn_f32_dwconv_minmax_ukernel_up8x9__avx_acc2,
181 8 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
182 }
f32_dwconv_up16x9__avx(benchmark::State & state,models::ExecutionPlanFactory model)183 static void f32_dwconv_up16x9__avx(benchmark::State& state, models::ExecutionPlanFactory model) {
184 DWConvEnd2EndBenchmark(state, model,
185 xnn_f32_dwconv_minmax_ukernel_up16x9__avx,
186 16 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
187 }
f32_dwconv_up16x9__avx_acc2(benchmark::State & state,models::ExecutionPlanFactory model)188 static void f32_dwconv_up16x9__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
189 DWConvEnd2EndBenchmark(state, model,
190 xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2,
191 16 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
192 }
193
f32_dwconv_up8x9__fma3(benchmark::State & state,models::ExecutionPlanFactory model)194 static void f32_dwconv_up8x9__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
195 DWConvEnd2EndBenchmark(state, model,
196 xnn_f32_dwconv_minmax_ukernel_up8x9__fma3,
197 8 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
198 }
f32_dwconv_up8x9__fma3_acc2(benchmark::State & state,models::ExecutionPlanFactory model)199 static void f32_dwconv_up8x9__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
200 DWConvEnd2EndBenchmark(state, model,
201 xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2,
202 8 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
203 }
f32_dwconv_up16x9__fma3(benchmark::State & state,models::ExecutionPlanFactory model)204 static void f32_dwconv_up16x9__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
205 DWConvEnd2EndBenchmark(state, model,
206 xnn_f32_dwconv_minmax_ukernel_up16x9__fma3,
207 16 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
208 }
f32_dwconv_up16x9__fma3_acc2(benchmark::State & state,models::ExecutionPlanFactory model)209 static void f32_dwconv_up16x9__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
210 DWConvEnd2EndBenchmark(state, model,
211 xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2,
212 16 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
213 }
214
f32_dwconv_up16x9__avx512f(benchmark::State & state,models::ExecutionPlanFactory model)215 static void f32_dwconv_up16x9__avx512f(benchmark::State& state, models::ExecutionPlanFactory model) {
216 DWConvEnd2EndBenchmark(state, model,
217 xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f,
218 16 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX512F);
219 }
f32_dwconv_up16x9__avx512f_acc2(benchmark::State & state,models::ExecutionPlanFactory model)220 static void f32_dwconv_up16x9__avx512f_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
221 DWConvEnd2EndBenchmark(state, model,
222 xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f_acc2,
223 16 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX512F);
224 }
f32_dwconv_up32x9__avx512f(benchmark::State & state,models::ExecutionPlanFactory model)225 static void f32_dwconv_up32x9__avx512f(benchmark::State& state, models::ExecutionPlanFactory model) {
226 DWConvEnd2EndBenchmark(state, model,
227 xnn_f32_dwconv_minmax_ukernel_up32x9__avx512f,
228 32 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX512F);
229 }
f32_dwconv_up32x9__avx512f_acc2(benchmark::State & state,models::ExecutionPlanFactory model)230 static void f32_dwconv_up32x9__avx512f_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
231 DWConvEnd2EndBenchmark(state, model,
232 xnn_f32_dwconv_minmax_ukernel_up32x9__avx512f_acc2,
233 32 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX512F);
234 }
235
// Register the x86 DWCONV benchmarks, grouped by ISA: AVX512F, FMA3, AVX, SSE.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this file
// (presumably in bench/end2end.h) — confirm which models it instantiates.
BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx512f);
BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx512f_acc2);
BENCHMARK_FP32_END2END(f32_dwconv_up32x9__avx512f);
BENCHMARK_FP32_END2END(f32_dwconv_up32x9__avx512f_acc2);

BENCHMARK_FP32_END2END(f32_dwconv_up8x9__fma3);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__fma3_acc2);
BENCHMARK_FP32_END2END(f32_dwconv_up16x9__fma3);
BENCHMARK_FP32_END2END(f32_dwconv_up16x9__fma3_acc2);

BENCHMARK_FP32_END2END(f32_dwconv_up8x9__avx);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__avx_acc2);
BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx);
BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx_acc2);

BENCHMARK_FP32_END2END(f32_dwconv_up4x9__sse);
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__sse_acc2);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__sse);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__sse_acc2);
255 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
256
257 #if XNN_ARCH_WASMSIMD
f32_dwconv_up4x9__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)258 static void f32_dwconv_up4x9__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
259 DWConvEnd2EndBenchmark(state, model,
260 xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm,
261 4 /* cr */, 9 /* mr */);
262 }
263
f32_dwconv_up4x9__wasmsimd_arm_acc2(benchmark::State & state,models::ExecutionPlanFactory model)264 static void f32_dwconv_up4x9__wasmsimd_arm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
265 DWConvEnd2EndBenchmark(state, model,
266 xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm_acc2,
267 4 /* cr */, 9 /* mr */);
268 }
269
f32_dwconv_up8x9__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)270 static void f32_dwconv_up8x9__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
271 DWConvEnd2EndBenchmark(state, model,
272 xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_arm,
273 8 /* cr */, 9 /* mr */);
274 }
275
f32_dwconv_up8x9__wasmsimd_arm_acc2(benchmark::State & state,models::ExecutionPlanFactory model)276 static void f32_dwconv_up8x9__wasmsimd_arm_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
277 DWConvEnd2EndBenchmark(state, model,
278 xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_arm_acc2,
279 8 /* cr */, 9 /* mr */);
280 }
281
f32_dwconv_up4x9__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)282 static void f32_dwconv_up4x9__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
283 DWConvEnd2EndBenchmark(state, model,
284 xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_x86,
285 4 /* cr */, 9 /* mr */);
286 }
287
f32_dwconv_up4x9__wasmsimd_x86_acc2(benchmark::State & state,models::ExecutionPlanFactory model)288 static void f32_dwconv_up4x9__wasmsimd_x86_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
289 DWConvEnd2EndBenchmark(state, model,
290 xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_x86_acc2,
291 4 /* cr */, 9 /* mr */);
292 }
293
f32_dwconv_up8x9__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)294 static void f32_dwconv_up8x9__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
295 DWConvEnd2EndBenchmark(state, model,
296 xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86,
297 8 /* cr */, 9 /* mr */);
298 }
299
f32_dwconv_up8x9__wasmsimd_x86_acc2(benchmark::State & state,models::ExecutionPlanFactory model)300 static void f32_dwconv_up8x9__wasmsimd_x86_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
301 DWConvEnd2EndBenchmark(state, model,
302 xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86_acc2,
303 8 /* cr */, 9 /* mr */);
304 }
305
// Register the WAsm SIMD DWCONV benchmarks: the "_arm" variants first, then the "_x86" variants.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this file
// (presumably in bench/end2end.h) — confirm which models it instantiates.
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__wasmsimd_arm);
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__wasmsimd_arm_acc2);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__wasmsimd_arm);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__wasmsimd_arm_acc2);

BENCHMARK_FP32_END2END(f32_dwconv_up4x9__wasmsimd_x86);
BENCHMARK_FP32_END2END(f32_dwconv_up4x9__wasmsimd_x86_acc2);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__wasmsimd_x86);
BENCHMARK_FP32_END2END(f32_dwconv_up8x9__wasmsimd_x86_acc2);
315 #endif // XNN_ARCH_WASMSIMD
316
f32_dwconv_up1x9__scalar(benchmark::State & state,models::ExecutionPlanFactory model)317 static void f32_dwconv_up1x9__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
318 DWConvEnd2EndBenchmark(state, model,
319 xnn_f32_dwconv_minmax_ukernel_up1x9__scalar,
320 1 /* cr */, 9 /* mr */);
321 }
322
f32_dwconv_up1x9__scalar_acc2(benchmark::State & state,models::ExecutionPlanFactory model)323 static void f32_dwconv_up1x9__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
324 DWConvEnd2EndBenchmark(state, model,
325 xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2,
326 1 /* cr */, 9 /* mr */);
327 }
328
f32_dwconv_up2x9__scalar(benchmark::State & state,models::ExecutionPlanFactory model)329 static void f32_dwconv_up2x9__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
330 DWConvEnd2EndBenchmark(state, model,
331 xnn_f32_dwconv_minmax_ukernel_up2x9__scalar,
332 2 /* cr */, 9 /* mr */);
333 }
334
f32_dwconv_up2x9__scalar_acc2(benchmark::State & state,models::ExecutionPlanFactory model)335 static void f32_dwconv_up2x9__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
336 DWConvEnd2EndBenchmark(state, model,
337 xnn_f32_dwconv_minmax_ukernel_up2x9__scalar_acc2,
338 2 /* cr */, 9 /* mr */);
339 }
340
// Register the scalar DWCONV benchmarks; these are not guarded by any architecture #if.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this file
// (presumably in bench/end2end.h) — confirm which models it instantiates.
BENCHMARK_FP32_END2END(f32_dwconv_up1x9__scalar);
BENCHMARK_FP32_END2END(f32_dwconv_up1x9__scalar_acc2);
BENCHMARK_FP32_END2END(f32_dwconv_up2x9__scalar);
BENCHMARK_FP32_END2END(f32_dwconv_up2x9__scalar_acc2);
345
// Emit the Google Benchmark entry point unless XNNPACK_BENCHMARK_NO_MAIN is
// defined (e.g. when the build supplies its own main()).
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
349