1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2020 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <random>
14 #include <vector>
15 
16 #include <cpuinfo.h>
17 
18 #include <benchmark/benchmark.h>
19 #include "bench/utils.h"
20 #include <xnnpack/AlignedAllocator.h>
21 #include <xnnpack/common.h>
22 #include <xnnpack/requantization-stubs.h>
23 
24 
25 class Requantization : public benchmark::Fixture {
26  public:
Requantization()27   inline Requantization()
28   {
29     cpuinfo_initialize();
30     const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
31     const size_t l1d_reserve = 1024;
32     n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(int8_t));
33     n_ = n_ / 16 * 16;
34   }
35 
SetUp(const benchmark::State &)36   virtual void SetUp(const benchmark::State&) override
37   {
38     std::random_device random_device;
39     auto rng = std::mt19937(random_device());
40     auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(), std::ref(rng));
41 
42     input_.resize(n());
43     std::generate(input_.begin(), input_.end(), std::ref(i32rng));
44     output_.resize(n());
45     std::fill(output_.begin(), output_.end(), 0xA5);
46   }
47 
TearDown(benchmark::State & state)48   virtual void TearDown(benchmark::State& state) override
49   {
50     state.SetItemsProcessed(uint64_t(state.iterations()) * n());
51     state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(int8_t)));
52     input_.clear();
53     output_.clear();
54   }
55 
input() const56   inline const int32_t* input() const
57   {
58     return input_.data();
59   }
60 
output()61   inline int8_t* output()
62   {
63     return output_.data();
64   }
65 
n() const66   inline size_t n() const
67   {
68     return n_;
69   }
70 
71  protected:
72   std::vector<int32_t, AlignedAllocator<int32_t, 32>> input_;
73   std::vector<int8_t> output_;
74   size_t n_;
75 };
76 
// Times xnn_qs8_requantize_precise__scalar_unsigned32 over the fixture's n() elements.
BENCHMARK_F(Requantization, precise__scalar_unsigned32)(benchmark::State& state) {
  // Requantization parameters shared by every benchmark in this file.
  constexpr float scale = 0x1.0p-12f;
  constexpr int zero_point = -1;
  constexpr int qmin = -127;
  constexpr int qmax = 126;

  const size_t count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_precise__scalar_unsigned32(count, src, scale, zero_point, qmin, qmax, dst);
  }
}
83 
// Times xnn_qs8_requantize_precise__scalar_unsigned64 over the fixture's n() elements.
BENCHMARK_F(Requantization, precise__scalar_unsigned64)(benchmark::State& state) {
  // Requantization parameters shared by every benchmark in this file.
  constexpr float scale = 0x1.0p-12f;
  constexpr int zero_point = -1;
  constexpr int qmin = -127;
  constexpr int qmax = 126;

  const size_t count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_precise__scalar_unsigned64(count, src, scale, zero_point, qmin, qmax, dst);
  }
}
90 
// Times xnn_qs8_requantize_precise__scalar_signed64 over the fixture's n() elements.
BENCHMARK_F(Requantization, precise__scalar_signed64)(benchmark::State& state) {
  // Requantization parameters shared by every benchmark in this file.
  constexpr float scale = 0x1.0p-12f;
  constexpr int zero_point = -1;
  constexpr int qmin = -127;
  constexpr int qmax = 126;

  const size_t count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_precise__scalar_signed64(count, src, scale, zero_point, qmin, qmax, dst);
  }
}
97 
// Times xnn_qs8_requantize_fp32__scalar_lrintf over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state) {
  // Requantization parameters shared by every benchmark in this file.
  constexpr float scale = 0x1.0p-12f;
  constexpr int zero_point = -1;
  constexpr int qmin = -127;
  constexpr int qmax = 126;

  const size_t count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_lrintf(count, src, scale, zero_point, qmin, qmax, dst);
  }
}
104 
// Times xnn_qs8_requantize_fp32__scalar_magic over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__scalar_magic)(benchmark::State& state) {
  // Requantization parameters shared by every benchmark in this file.
  constexpr float scale = 0x1.0p-12f;
  constexpr int zero_point = -1;
  constexpr int qmin = -127;
  constexpr int qmax = 126;

  const size_t count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_magic(count, src, scale, zero_point, qmin, qmax, dst);
  }
}
111 
// Times xnn_qs8_requantize_q31__scalar over the fixture's n() elements.
BENCHMARK_F(Requantization, q31__scalar)(benchmark::State& state) {
  // Requantization parameters shared by every benchmark in this file.
  constexpr float scale = 0x1.0p-12f;
  constexpr int zero_point = -1;
  constexpr int qmin = -127;
  constexpr int qmax = 126;

  const size_t count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_q31__scalar(count, src, scale, zero_point, qmin, qmax, dst);
  }
}
118 
119 
120 #if XNN_ARCH_WASMSIMD
BENCHMARK_F(Requantization,fp32__wasmsimd)121   BENCHMARK_F(Requantization, fp32__wasmsimd)(benchmark::State& state) {
122     for (auto _ : state) {
123       xnn_qs8_requantize_fp32__wasmsimd(
124           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
125     }
126   }
127 
BENCHMARK_F(Requantization,q31__wasmsimd)128   BENCHMARK_F(Requantization, q31__wasmsimd)(benchmark::State& state) {
129     for (auto _ : state) {
130       xnn_qs8_requantize_q31__wasmsimd(
131           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
132     }
133   }
134 #endif
135 
136 
137 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_F(Requantization,precise__neon)138   BENCHMARK_F(Requantization, precise__neon)(benchmark::State& state) {
139     for (auto _ : state) {
140       xnn_qs8_requantize_precise__neon(
141           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
142     }
143   }
144 
BENCHMARK_F(Requantization,fp32__neon)145   BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state) {
146     for (auto _ : state) {
147       xnn_qs8_requantize_fp32__neon(
148           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
149     }
150   }
151 
BENCHMARK_F(Requantization,q31__neon)152   BENCHMARK_F(Requantization, q31__neon)(benchmark::State& state) {
153     for (auto _ : state) {
154       xnn_qs8_requantize_q31__neon(
155           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
156     }
157   }
158 #endif
159 
160 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_F(Requantization,precise__sse2)161   BENCHMARK_F(Requantization, precise__sse2)(benchmark::State& state) {
162     for (auto _ : state) {
163       xnn_qs8_requantize_precise__sse2(
164           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
165     }
166   }
167 
BENCHMARK_F(Requantization,precise__ssse3)168   BENCHMARK_F(Requantization, precise__ssse3)(benchmark::State& state) {
169     for (auto _ : state) {
170       xnn_qs8_requantize_precise__ssse3(
171           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
172     }
173   }
174 
BENCHMARK_F(Requantization,precise__sse4)175   BENCHMARK_F(Requantization, precise__sse4)(benchmark::State& state) {
176     for (auto _ : state) {
177       xnn_qs8_requantize_precise__sse4(
178           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
179     }
180   }
181 
BENCHMARK_F(Requantization,fp32__sse2)182   BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state) {
183     for (auto _ : state) {
184       xnn_qs8_requantize_fp32__sse2(
185           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
186     }
187   }
188 
BENCHMARK_F(Requantization,fp32__sse4)189   BENCHMARK_F(Requantization, fp32__sse4)(benchmark::State& state) {
190     for (auto _ : state) {
191       xnn_qs8_requantize_fp32__sse4(
192           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
193     }
194   }
195 
BENCHMARK_F(Requantization,q31__sse2)196   BENCHMARK_F(Requantization, q31__sse2)(benchmark::State& state) {
197     for (auto _ : state) {
198       xnn_qs8_requantize_q31__sse2(
199           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
200     }
201   }
202 
BENCHMARK_F(Requantization,q31__ssse3)203   BENCHMARK_F(Requantization, q31__ssse3)(benchmark::State& state) {
204     for (auto _ : state) {
205       xnn_qs8_requantize_q31__ssse3(
206           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
207     }
208   }
209 
BENCHMARK_F(Requantization,q31__sse4)210   BENCHMARK_F(Requantization, q31__sse4)(benchmark::State& state) {
211     for (auto _ : state) {
212       xnn_qs8_requantize_q31__sse4(
213           n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
214     }
215   }
216 #endif
217 
// Expand BENCHMARK_MAIN() here unless the build defines XNNPACK_BENCHMARK_NO_MAIN
// (presumably because an entry point is supplied elsewhere -- confirm against the
// build configuration).
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
221