1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <cstdio>
7 #include <cstdlib>
8 #include <cstring>
9 #include <mutex>
10 
11 #ifdef __linux__
12   #include <sched.h>
13 #endif
14 #if defined(__ANDROID__) || defined(_WIN32) || defined(__CYGWIN__)
15   #include <malloc.h>
16 #endif
17 #if defined(__SSE__) || defined(__x86_64__)
18   #include <xmmintrin.h>
19 #endif
20 
21 #include <cpuinfo.h>
22 
23 #include "bench/utils.h"
24 
25 
// Scratch memory that WipeCache() reads through; allocated by InitWipeBuffer().
static void* wipe_buffer{nullptr};
// Size of the scratch memory in bytes; set by InitWipeBuffer().
static size_t wipe_buffer_size{0};
// Passed to std::call_once in WipeCache() so InitWipeBuffer() runs only once.
static std::once_flag wipe_buffer_guard;
30 
InitWipeBuffer()31 static void InitWipeBuffer() {
32   // Default: the largest know cache size (128 MB Intel Crystalwell L4 cache).
33   wipe_buffer_size = 128 * 1024 * 1024;
34   if (cpuinfo_initialize()) {
35     wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
36   }
37 #if defined(_WIN32)
38   wipe_buffer = _aligned_malloc(wipe_buffer_size, 128);
39 #elif defined(__ANDROID__) || defined(__CYGWIN__)
40   // memalign is obsolete, but it is the only option on Android until API level 17.
41   wipe_buffer = memalign(128, wipe_buffer_size);
42 #else
43   (void) posix_memalign((void**) &wipe_buffer, 128, wipe_buffer_size);
44 #endif
45   if (wipe_buffer != nullptr) {
46     memset(wipe_buffer, 0xA5, wipe_buffer_size);
47   }
48 }
49 
50 namespace benchmark {
51 namespace utils {
52 
PrefetchToL1(const void * ptr,size_t size)53 uint32_t PrefetchToL1(const void* ptr, size_t size) {
54   uint32_t step = 16;
55   if (cpuinfo_initialize()) {
56     step = cpuinfo_get_l1d_cache(0)->line_size;
57   }
58   const uint8_t* u8_ptr = static_cast<const uint8_t*>(ptr);
59   // Compute and return sum of data to prevent compiler from removing data reads.
60   uint32_t sum = 0;
61   while (size >= step) {
62     sum += uint32_t(*u8_ptr);
63     u8_ptr += step;
64     size -= step;
65   }
66   return sum;
67 }
68 
WipeCache()69 uint32_t WipeCache() {
70   std::call_once(wipe_buffer_guard, InitWipeBuffer);
71   return PrefetchToL1(wipe_buffer, wipe_buffer_size);
72 }
73 
// Sets the flush-to-zero style control bits in the floating-point control
// register of the calling thread. Implemented for x86 with SSE, 32-bit ARM
// with hardware floating point, and AArch64; a no-op on any other target.
void DisableDenormals() {
#if defined(__SSE__) || defined(__x86_64__)
  // MXCSR: bit 15 is flush-to-zero, bit 6 is denormals-are-zero.
  const unsigned int flush_to_zero = 0x8000;
  const unsigned int denormals_are_zero = 0x0040;
  _mm_setcsr(_mm_getcsr() | flush_to_zero | denormals_are_zero);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
  // Read-modify-write of FPSCR, setting bit 24 (0x1000000).
  uint32_t fpscr;
  #if defined(__thumb__) && !defined(__thumb2__)
    // Thumb-1 variant: the mask is passed in a low register rather than being
    // encoded as an immediate, and ORRS updates the condition flags.
    __asm__ __volatile__(
        "VMRS %[fpscr], fpscr\n"
        "ORRS %[fpscr], %[bitmask]\n"
        "VMSR fpscr, %[fpscr]\n"
        : [fpscr] "=l" (fpscr)
        : [bitmask] "l" (0x1000000)
        : "cc");
  #else
    __asm__ __volatile__(
        "VMRS %[fpscr], fpscr\n"
        "ORR %[fpscr], #0x1000000\n"
        "VMSR fpscr, %[fpscr]\n"
        : [fpscr] "=r" (fpscr));
  #endif
#elif defined(__aarch64__)
  // Read-modify-write of FPCR, setting bit 24 (0x1000000) and bit 19 (0x80000).
  uint64_t fpcr;
  __asm__ __volatile__(
      "MRS %[fpcr], fpcr\n"
      "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
      "ORR %w[fpcr], %w[fpcr], 0x80000\n"
      "MSR fpcr, %[fpcr]\n"
    : [fpcr] "=r" (fpcr));
#endif
}
104 
// Returns the current clock rate, in Hz, of the CPU the calling thread is
// running on, or 0 if it cannot be determined.
//
// On Linux the value is read from the cpufreq sysfs node scaling_cur_freq
// (reported in kHz) of the CPU returned by sched_getcpu(). On other platforms
// the function always returns 0.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
  const int cpu = sched_getcpu();
  if (cpu >= 0) {
    char cpuinfo_name[512];
    snprintf(cpuinfo_name, sizeof(cpuinfo_name),
      "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);

    FILE* f = fopen(cpuinfo_name, "r");
    if (f != nullptr) {
      long long freq_khz = 0;
      // fscanf returns EOF (non-zero) on an empty or unreadable file, so the
      // result must be compared against the number of expected conversions.
      const int parsed = fscanf(f, "%lld", &freq_khz);
      fclose(f);
      // Reject non-positive values: a negative number would turn into a huge
      // frequency when converted to uint64_t.
      if (parsed == 1 && freq_khz > 0) {
        return uint64_t(freq_khz) * 1000;
      }
    }
  }
#endif  // __linux__
  return 0;
}
125 
GetMaxCacheSize()126 size_t GetMaxCacheSize() {
127   if (!cpuinfo_initialize()) {
128     #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
129       // DynamIQ max: 4 MB
130       return 4 * 1024 * 1024;
131     #else
132       // Intel eDRAM max: 128 MB
133       return 128 * 1024 * 1024;
134     #endif
135   }
136   return cpuinfo_get_max_cache_size();
137 }
138 
MultiThreadingParameters(benchmark::internal::Benchmark * benchmark)139 void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) {
140   benchmark->ArgName("T");
141 
142   // Disabled thread pool (execution on the caller thread only).
143   benchmark->Arg(1);
144 
145   if (cpuinfo_initialize()) {
146     // All cores except the little ones.
147     uint32_t max_cores = cpuinfo_get_cores_count();
148     if (cpuinfo_get_clusters_count() > 1) {
149       max_cores -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count;
150     }
151     for (uint32_t t = 2; t <= max_cores; t++) {
152       benchmark->Arg(t);
153     }
154 
155     // All cores (if more than one cluster).
156     if (cpuinfo_get_cores_count() > max_cores) {
157       benchmark->Arg(cpuinfo_get_cores_count());
158     }
159 
160     // All cores + hyperthreads (only if hyperthreading supported).
161     if (cpuinfo_get_processors_count() > cpuinfo_get_cores_count()) {
162       benchmark->Arg(cpuinfo_get_processors_count());
163     }
164   }
165 }
166 
167 
CheckVFP(benchmark::State & state)168 bool CheckVFP(benchmark::State& state) {
169   if (!cpuinfo_initialize() || !(cpuinfo_has_arm_vfpv2() || cpuinfo_has_arm_vfpv3())) {
170     state.SkipWithError("no VFP extension");
171     return false;
172   }
173   return true;
174 }
175 
CheckNEONFP16ARITH(benchmark::State & state)176 bool CheckNEONFP16ARITH(benchmark::State& state) {
177   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fp16_arith()) {
178     state.SkipWithError("no NEON-FP16-ARITH extension");
179     return false;
180   }
181   return true;
182 }
183 
CheckNEON(benchmark::State & state)184 bool CheckNEON(benchmark::State& state) {
185   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
186     state.SkipWithError("no NEON extension");
187     return false;
188   }
189   return true;
190 }
191 
CheckNEONFMA(benchmark::State & state)192 bool CheckNEONFMA(benchmark::State& state) {
193   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) {
194     state.SkipWithError("no NEON-FMA extension");
195     return false;
196   }
197   return true;
198 }
199 
CheckNEONDOT(benchmark::State & state)200 bool CheckNEONDOT(benchmark::State& state) {
201   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_dot()) {
202     state.SkipWithError("no NEON-DOT extension");
203     return false;
204   }
205   return true;
206 }
207 
CheckSSSE3(benchmark::State & state)208 bool CheckSSSE3(benchmark::State& state) {
209   if (!cpuinfo_initialize() || !cpuinfo_has_x86_ssse3()) {
210     state.SkipWithError("no SSSE3 extension");
211     return false;
212   }
213   return true;
214 }
215 
CheckSSE41(benchmark::State & state)216 bool CheckSSE41(benchmark::State& state) {
217   if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
218     state.SkipWithError("no SSE4.1 extension");
219     return false;
220   }
221   return true;
222 }
223 
CheckAVX(benchmark::State & state)224 bool CheckAVX(benchmark::State& state) {
225   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) {
226     state.SkipWithError("no AVX extension");
227     return false;
228   }
229   return true;
230 }
231 
CheckXOP(benchmark::State & state)232 bool CheckXOP(benchmark::State& state) {
233   if (!cpuinfo_initialize() || !cpuinfo_has_x86_xop()) {
234     state.SkipWithError("no XOP extension");
235     return false;
236   }
237   return true;
238 }
239 
CheckFMA3(benchmark::State & state)240 bool CheckFMA3(benchmark::State& state) {
241   if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) {
242     state.SkipWithError("no FMA3 extension");
243     return false;
244   }
245   return true;
246 }
247 
CheckAVX2(benchmark::State & state)248 bool CheckAVX2(benchmark::State& state) {
249   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) {
250     state.SkipWithError("no AVX2 extension");
251     return false;
252   }
253   return true;
254 }
255 
CheckAVX512F(benchmark::State & state)256 bool CheckAVX512F(benchmark::State& state) {
257   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) {
258     state.SkipWithError("no AVX512F extension");
259     return false;
260   }
261   return true;
262 }
263 
CheckAVX512SKX(benchmark::State & state)264 bool CheckAVX512SKX(benchmark::State& state) {
265   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f() ||
266       !cpuinfo_has_x86_avx512cd() || !cpuinfo_has_x86_avx512bw() ||
267       !cpuinfo_has_x86_avx512dq() || !cpuinfo_has_x86_avx512vl())
268   {
269     state.SkipWithError("no AVX512 SKX extensions");
270     return false;
271   }
272   return true;
273 }
274 
275 }  // namespace utils
276 }  // namespace benchmark
277