1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2020 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <random>
14 #include <vector>
15
16 #include <cpuinfo.h>
17
18 #include <benchmark/benchmark.h>
19 #include "bench/utils.h"
20 #include <xnnpack/AlignedAllocator.h>
21 #include <xnnpack/common.h>
22 #include <xnnpack/requantization-stubs.h>
23
24
25 class Requantization : public benchmark::Fixture {
26 public:
Requantization()27 inline Requantization()
28 {
29 cpuinfo_initialize();
30 const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
31 const size_t l1d_reserve = 1024;
32 n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(int8_t));
33 n_ = n_ / 16 * 16;
34 }
35
SetUp(const benchmark::State &)36 virtual void SetUp(const benchmark::State&) override
37 {
38 std::random_device random_device;
39 auto rng = std::mt19937(random_device());
40 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(), std::ref(rng));
41
42 input_.resize(n());
43 std::generate(input_.begin(), input_.end(), std::ref(i32rng));
44 output_.resize(n());
45 std::fill(output_.begin(), output_.end(), 0xA5);
46 }
47
TearDown(benchmark::State & state)48 virtual void TearDown(benchmark::State& state) override
49 {
50 state.SetItemsProcessed(uint64_t(state.iterations()) * n());
51 state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(int8_t)));
52 input_.clear();
53 output_.clear();
54 }
55
input() const56 inline const int32_t* input() const
57 {
58 return input_.data();
59 }
60
output()61 inline int8_t* output()
62 {
63 return output_.data();
64 }
65
n() const66 inline size_t n() const
67 {
68 return n_;
69 }
70
71 protected:
72 std::vector<int32_t, AlignedAllocator<int32_t, 32>> input_;
73 std::vector<int8_t> output_;
74 size_t n_;
75 };
76
// Times xnn_qs8_requantize_precise__scalar_unsigned32 over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, precise__scalar_unsigned32)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_precise__scalar_unsigned32(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
83
// Times xnn_qs8_requantize_precise__scalar_unsigned64 over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, precise__scalar_unsigned64)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_precise__scalar_unsigned64(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
90
// Times xnn_qs8_requantize_precise__scalar_signed64 over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, precise__scalar_signed64)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_precise__scalar_signed64(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
97
// Times xnn_qs8_requantize_fp32__scalar_lrintf over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_lrintf(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
104
// Times xnn_qs8_requantize_fp32__scalar_magic over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, fp32__scalar_magic)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_magic(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
111
// Times xnn_qs8_requantize_q31__scalar over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, q31__scalar)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_q31__scalar(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
118
119
120 #if XNN_ARCH_WASMSIMD
// Times xnn_qs8_requantize_fp32__wasmsimd over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, fp32__wasmsimd)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__wasmsimd(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
127
// Times xnn_qs8_requantize_q31__wasmsimd over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, q31__wasmsimd)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_q31__wasmsimd(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
134 #endif
135
136
137 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Times xnn_qs8_requantize_precise__neon over the fixture's n() elements
// with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, precise__neon)(benchmark::State& state) {
  // The enclosing guard also covers 32-bit ARM, where NEON is optional.
  // Going by its name the kernel needs NEON, so skip rather than fault
  // when cpuinfo reports the extension as absent.
  if (!cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_precise__neon(
      n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
144
// Times xnn_qs8_requantize_fp32__neon over the fixture's n() elements
// with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state) {
  // The enclosing guard also covers 32-bit ARM, where NEON is optional.
  // Going by its name the kernel needs NEON, so skip rather than fault
  // when cpuinfo reports the extension as absent.
  if (!cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__neon(
      n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
151
// Times xnn_qs8_requantize_q31__neon over the fixture's n() elements
// with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, q31__neon)(benchmark::State& state) {
  // The enclosing guard also covers 32-bit ARM, where NEON is optional.
  // Going by its name the kernel needs NEON, so skip rather than fault
  // when cpuinfo reports the extension as absent.
  if (!cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_q31__neon(
      n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
158 #endif
159
160 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Times xnn_qs8_requantize_precise__sse2 over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, precise__sse2)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_precise__sse2(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
167
// Times xnn_qs8_requantize_precise__ssse3 over the fixture's n() elements
// with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, precise__ssse3)(benchmark::State& state) {
  // Going by its name the kernel needs SSSE3, which not every x86 CPU has.
  // Skip rather than fault when cpuinfo reports the extension as absent.
  if (!cpuinfo_has_x86_ssse3()) {
    state.SkipWithError("no SSSE3 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_precise__ssse3(
      n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
174
// Times xnn_qs8_requantize_precise__sse4 over the fixture's n() elements
// with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, precise__sse4)(benchmark::State& state) {
  // Going by its name the kernel needs SSE4.1, which not every x86 CPU has.
  // Skip rather than fault when cpuinfo reports the extension as absent.
  if (!cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_precise__sse4(
      n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
181
// Times xnn_qs8_requantize_fp32__sse2 over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse2(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
188
// Times xnn_qs8_requantize_fp32__sse4 over the fixture's n() elements
// with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, fp32__sse4)(benchmark::State& state) {
  // Going by its name the kernel needs SSE4.1, which not every x86 CPU has.
  // Skip rather than fault when cpuinfo reports the extension as absent.
  if (!cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse4(
      n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
195
// Times xnn_qs8_requantize_q31__sse2 over the fixture's
// n() elements with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, q31__sse2)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_q31__sse2(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
202
// Times xnn_qs8_requantize_q31__ssse3 over the fixture's n() elements
// with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, q31__ssse3)(benchmark::State& state) {
  // Going by its name the kernel needs SSSE3, which not every x86 CPU has.
  // Skip rather than fault when cpuinfo reports the extension as absent.
  if (!cpuinfo_has_x86_ssse3()) {
    state.SkipWithError("no SSSE3 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_q31__ssse3(
      n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
209
// Times xnn_qs8_requantize_q31__sse4 over the fixture's n() elements
// with scale 2^-12, zero point -1, qmin -127 and qmax 126.
BENCHMARK_F(Requantization, q31__sse4)(benchmark::State& state) {
  // Going by its name the kernel needs SSE4.1, which not every x86 CPU has.
  // Skip rather than fault when cpuinfo reports the extension as absent.
  if (!cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_q31__sse4(
      n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
216 #endif
217
218 #ifndef XNNPACK_BENCHMARK_NO_MAIN
219 BENCHMARK_MAIN();
220 #endif
221