// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <xnnpack.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_TENSORFLOW_LITE
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/version.h"
#endif  // BENCHMARK_TENSORFLOW_LITE
#include "bench/utils.h"

#ifndef XNN_NO_QU8_OPERATORS
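// Benchmarks XNNPACK's quantized (QU8) 2D average pooling operator in NHWC layout
// over the problem sizes supplied via the benchmark arguments below.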
static void xnnpack_average_pooling_qu8(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t pooling_size = state.range(3);
  const size_t padding_size = state.range(4);
  const size_t stride = state.range(5);
  const size_t channels = state.range(6);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

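  // Output spatial size with symmetric padding: floor((2 * padding + input - pooling) / stride) + 1.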
  const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
  const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;

  std::vector<uint8_t> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(input.begin(), input.end(), std::ref(u8rng));
  std::vector<uint8_t> output(batch_size * output_height * output_width * channels);
  std::fill(output.begin(), output.end(), 0xA5);

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t pooling_op = nullptr;
  status = xnn_create_average_pooling2d_nhwc_qu8(
    padding_size, padding_size, padding_size, padding_size,
    pooling_size, pooling_size,
    stride, stride,
    channels, channels /* input pixel stride */, channels /* output pixel stride */,
    127 /* input zero point */, 0.75f /* input scale */,
    127 /* output zero point */, 1.25f /* output scale */,
    0 /* output min */, 255 /* output max */,
    0 /* flags */, &pooling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create Average Pooling operator");
    return;
  }

  status = xnn_setup_average_pooling2d_nhwc_qu8(
    pooling_op,
    batch_size, input_height, input_width,
    input.data(), output.data(),
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Average Pooling operator");
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(pooling_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Average Pooling operator");
      return;
    }
  }

  status = xnn_delete_operator(pooling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Average Pooling operator");
    return;
  }
  pooling_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

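  // Report memory traffic (input bytes read plus output bytes written) as a rate.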
  state.counters["bytes"] = benchmark::Counter(
    uint64_t(state.iterations()) *
      batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(uint8_t),
    benchmark::Counter::kIsRate);
}
#endif  // XNN_NO_QU8_OPERATORS

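// Benchmarks XNNPACK's single-precision (F32) 2D average pooling operator in NHWC layout.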
static void xnnpack_average_pooling_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t pooling_size = state.range(3);
  const size_t padding_size = state.range(4);
  const size_t stride = state.range(5);
  const size_t channels = state.range(6);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
  const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;

  std::vector<float> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<float> output(batch_size * output_height * output_width * channels);
  std::fill(output.begin(), output.end(), std::nanf(""));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t pooling_op = nullptr;
  status = xnn_create_average_pooling2d_nhwc_f32(
    padding_size, padding_size, padding_size, padding_size,
    pooling_size, pooling_size,
    stride, stride,
    channels, channels /* input pixel stride */, channels /* output pixel stride */,
    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
    0 /* flags */, &pooling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create Average Pooling operator");
    return;
  }

  status = xnn_setup_average_pooling2d_nhwc_f32(
    pooling_op,
    batch_size, input_height, input_width,
    input.data(), output.data(),
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Average Pooling operator");
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(pooling_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Average Pooling operator");
      return;
    }
  }

  status = xnn_delete_operator(pooling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Average Pooling operator");
    return;
  }
  pooling_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["bytes"] = benchmark::Counter(
    uint64_t(state.iterations()) *
      batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(float),
    benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_TENSORFLOW_LITE
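// Benchmarks the equivalent AVERAGE_POOL_2D operation through the TensorFlow Lite
// interpreter as a baseline for comparison.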
void tflite_average_pooling_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t pooling_size = state.range(3);
  const size_t padding_size = state.range(4);
  const size_t stride = state.range(5);
  const size_t channels = state.range(6);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

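  // TFLite only expresses SAME and VALID padding: total padding of (pooling_size - 1)
  // maps to SAME, zero padding maps to VALID; other configurations are skipped.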
  tflite::Padding padding = tflite::Padding_VALID;
  if (2 * padding_size == (pooling_size - 1)) {
    padding = tflite::Padding_SAME;
  } else if (padding_size == 0) {
    padding = tflite::Padding_VALID;
  } else {
    state.SkipWithError("unsupported padding");
    return;
  }

  const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
  const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;

  std::vector<float> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<float> output(batch_size * output_height * output_width * channels);
  std::fill(output.begin(), output.end(), std::nanf(""));

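  // Build an in-memory FlatBuffer model containing a single AVERAGE_POOL_2D operator.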
  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
      CreateOperatorCode(builder, tflite::BuiltinOperator_AVERAGE_POOL_2D);

  flatbuffers::Offset<tflite::Pool2DOptions> pool2d_options = CreatePool2DOptions(
      builder, padding,
      stride /* stride_w */, stride /* stride_h */,
      pooling_size /* filter_width */, pooling_size /* filter_height */,
      tflite::ActivationFunctionType_NONE);

  flatbuffers::Offset<tflite::Buffer> buffers[1] = {
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  };

  const int32_t input_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(input_height),
    static_cast<int32_t>(input_width),
    static_cast<int32_t>(channels)
  };
  const int32_t output_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(output_height),
    static_cast<int32_t>(output_width),
    static_cast<int32_t>(channels)
  };

  flatbuffers::Offset<tflite::Tensor> tensors[2] = {
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(input_shape, 4),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(output_shape, 4),
                         tflite::TensorType_FLOAT32),
  };

  const int32_t op_inputs[1] = { 0 };
  const int32_t op_outputs[1] = { 1 };
  flatbuffers::Offset<tflite::Operator> op = CreateOperator(
      builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs, 1),
      builder.CreateVector<int32_t>(op_outputs, 1),
      tflite::BuiltinOptions_Pool2DOptions,
      pool2d_options.Union());

  const int32_t graph_inputs[1] = { 0 };
  const int32_t graph_outputs[1] = { 1 };
  flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
      builder,
      builder.CreateVector(tensors, 2),
      builder.CreateVector<int32_t>(graph_inputs, 1),
      builder.CreateVector<int32_t>(graph_outputs, 1),
      builder.CreateVector(&op, 1));

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      builder.CreateString("AVERAGE_POOL_2D model"),
      builder.CreateVector(buffers, 1));

  builder.Finish(model_buffer);

  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  if (interpreter == nullptr) {
    state.SkipWithError("TFLite interpreter is null");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

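  // Populate the input tensor with random data outside the timed loop.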
  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size * input_height * input_width * channels,
    std::ref(f32rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["bytes"] = benchmark::Counter(
    uint64_t(state.iterations()) *
      batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(float),
    benchmark::Counter::kIsRate);
}
#endif  // BENCHMARK_TENSORFLOW_LITE

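// Benchmark argument columns: N = batch size, H/W = input height/width,
// K = pooling size, P = padding, S = stride, C = channels.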
// Final global average pooling in ImageNet classification models.
static void ImageNet(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});

  /*       N   H   W   K  P  S   C */
  b->Args({1, 13, 13, 13, 0, 1, 1000});
  b->Args({1,  7,  7,  7, 0, 1, 1000});
}

// ShuffleNet v1 with 1 group.
static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});

  /*       N   H   W  K  P  S   C */
  b->Args({1, 56, 56, 3, 1, 2,  24});
  b->Args({1, 28, 28, 3, 1, 2, 144});
  b->Args({1, 14, 14, 3, 1, 2, 288});
  b->Args({1,  7,  7, 3, 1, 2, 576});
}

// ShuffleNet v1 with 2 groups.
static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});

  /*       N   H   W  K  P  S   C */
  b->Args({1, 56, 56, 3, 1, 2,  24});
  b->Args({1, 28, 28, 3, 1, 2, 200});
  b->Args({1, 14, 14, 3, 1, 2, 400});
  b->Args({1,  7,  7, 3, 1, 2, 800});
}

// ShuffleNet v1 with 3 groups.
static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});

  /*       N   H   W  K  P  S   C */
  b->Args({1, 56, 56, 3, 1, 2,  24});
  b->Args({1, 28, 28, 3, 1, 2, 240});
  b->Args({1, 14, 14, 3, 1, 2, 480});
  b->Args({1,  7,  7, 3, 1, 2, 960});
}

// ShuffleNet v1 with 4 groups.
static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});

  /*       N   H   W  K  P  S    C */
  b->Args({1, 56, 56, 3, 1, 2,   24});
  b->Args({1, 28, 28, 3, 1, 2,  272});
  b->Args({1, 14, 14, 3, 1, 2,  576});
  b->Args({1,  7,  7, 3, 1, 2, 1088});
}

// ShuffleNet v1 with 8 groups.
static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});

  /*       N   H   W  K  P  S    C */
  b->Args({1, 56, 56, 3, 1, 2,   24});
  b->Args({1, 28, 28, 3, 1, 2,  384});
  b->Args({1, 14, 14, 3, 1, 2,  768});
  b->Args({1,  7,  7, 3, 1, 2, 1536});
}

BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();

#ifdef BENCHMARK_TENSORFLOW_LITE
BENCHMARK_CAPTURE(tflite_average_pooling_f32, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE

#ifndef XNN_NO_QU8_OPERATORS
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
#endif  // XNN_NO_QU8_OPERATORS

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif