1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // common.h: contains stuff that's used throughout gemmlowp
16 // and should always be available.
17 
18 #ifndef GEMMLOWP_INTERNAL_COMMON_H_
19 #define GEMMLOWP_INTERNAL_COMMON_H_
20 
21 #include "../internal/platform.h"
22 #include "../profiling/pthread_everywhere.h"
23 
24 #include <algorithm>
25 #include <cassert>
26 #include <cmath>
27 #include <cstdlib>
28 
29 #include "../profiling/instrumentation.h"
30 
// Our inline assembly paths assume GCC/Clang syntax, so they are only
// allowed on compilers that define __GNUC__.
// Native Client is excluded: it doesn't seem to support inline assembly(?).
#if !defined(__native_client__) && defined(__GNUC__)
#define GEMMLOWP_ALLOW_INLINE_ASM
#endif
36 
// GEMMLOWP_NOINLINE: annotation asking the compiler not to inline a function.
// Expands to the GCC-style noinline attribute on compilers that define
// __GNUC__, and to nothing on every other compiler.
#if !defined(__GNUC__)
#define GEMMLOWP_NOINLINE
#else
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#endif
44 
// ARM detection.
//   GEMMLOWP_ARM_32: set when the compiler defines __arm__.
//   GEMMLOWP_ARM_64: set when the compiler defines __aarch64__.
//   GEMMLOWP_ARM:    set when either of the two tokens above is defined.
#if defined(__arm__)
#define GEMMLOWP_ARM_32
#endif

#if defined(__aarch64__)
#define GEMMLOWP_ARM_64
#endif

#if defined(GEMMLOWP_ARM_64) || defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_ARM
#endif
57 
// MIPS detection. When the compiler defines __mips, the presence of __LP64__
// selects the 64-bit token and its absence selects the 32-bit token.
// GEMMLOWP_MIPS is set when either width-specific token is defined.
#if defined(__mips)
#if defined(__LP64__)
#define GEMMLOWP_MIPS_64
#else
#define GEMMLOWP_MIPS_32
#endif
#endif

#if defined(GEMMLOWP_MIPS_64) || defined(GEMMLOWP_MIPS_32)
#define GEMMLOWP_MIPS
#endif
70 
// x86 detection. Several compiler-specific spellings are accepted for each
// width. GEMMLOWP_X86 is set when either width-specific token is defined.
#if defined(__i386) || defined(__i386__) || defined(_X86_) || defined(_M_IX86)
#define GEMMLOWP_X86_32
#endif

#if defined(__amd64) || defined(__x86_64__) || defined(_M_X64)
#define GEMMLOWP_X86_64
#endif

#if defined(GEMMLOWP_X86_64) || defined(GEMMLOWP_X86_32)
#define GEMMLOWP_X86
#endif
83 
// Optional SIMD paths. Some of our optimized paths use inline assembly, and
// for now we don't bother enabling the other optimized paths (the ones using
// intrinsics) where the inline assembly paths can't be used. Everything up to
// the matching #endif is therefore skipped unless GEMMLOWP_ALLOW_INLINE_ASM
// is defined.
#if defined(GEMMLOWP_ALLOW_INLINE_ASM)

// NEON. Both spellings of the compiler token are checked.
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define GEMMLOWP_NEON
#endif

// Convenience NEON tokens, split by ARM word size.
#if defined(GEMMLOWP_NEON)
#if defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_NEON_32
#endif
#if defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_NEON_64
#endif
#endif

// MIPS MSA. Requires ISA revision 5 or later, and for now the MSA
// optimizations are limited to little-endian CPUs.
// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
#if defined(GEMMLOWP_MIPS) && defined(__mips_msa) && (__mips_isa_rev >= 5) && \
    defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define GEMMLOWP_MSA
#endif

// Convenience MIPS MSA tokens, split by MIPS word size.
#if defined(GEMMLOWP_MSA)
#if defined(GEMMLOWP_MIPS_32)
#define GEMMLOWP_MSA_32
#endif
#if defined(GEMMLOWP_MIPS_64)
#define GEMMLOWP_MSA_64
#endif
#endif

// SSE, keyed on the compiler's own feature tokens.
#if defined(__SSE4_1__)
#define GEMMLOWP_SSE4
#endif

#if defined(__SSE3__)
#define GEMMLOWP_SSE3
#endif

// Convenience SSE4 tokens, split by x86 word size. Defining
// GEMMLOWP_DISABLE_SSE4 suppresses them even when SSE4 is detected.
#if defined(GEMMLOWP_SSE4) && !defined(GEMMLOWP_DISABLE_SSE4)
#if defined(GEMMLOWP_X86_32)
#define GEMMLOWP_SSE4_32
#endif
#if defined(GEMMLOWP_X86_64)
#define GEMMLOWP_SSE4_64
#endif
#endif

// Convenience SSE3 tokens, split by x86 word size.
#if defined(GEMMLOWP_SSE3)
#if defined(GEMMLOWP_X86_32)
#define GEMMLOWP_SSE3_32
#endif
#if defined(GEMMLOWP_X86_64)
#define GEMMLOWP_SSE3_64
#endif
#endif

// Sanitizer hook. When building under MemorySanitizer or AddressSanitizer,
// GEMMLOWP_MARK_MEMORY_AS_INITIALIZED names the matching "unpoison" entry
// point; otherwise it is left undefined. The __has_feature test must stay
// nested inside the defined() check so that compilers lacking __has_feature
// never have to parse it.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __msan_unpoison
#elif __has_feature(address_sanitizer)
#include <sanitizer/asan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __asan_unpoison_memory_region
#endif
#endif

#endif  // GEMMLOWP_ALLOW_INLINE_ASM
159 
// Android detection. This is deliberately kept separate from ARM detection:
// we care about tuning for non-ARM Android devices too. Combined with the x86
// tokens, it allows tuning differently for mobile x86 CPUs (Atom) and desktop
// x86 CPUs.
#if defined(ANDROID) || defined(__ANDROID__)
#define GEMMLOWP_ANDROID
#endif
166 
167 namespace gemmlowp {
168 
// Assumed cache line size, in bytes; used to tune alignment and prefetching.
// Querying the real value at runtime would be ideal, but 64-byte lines are
// by far the most common, and on a device where this guess is wrong it is
// off by at most a factor of 2, which should be acceptable.
const int kDefaultCacheLineSize = 64;
175 
// Default L1 and L2 data cache sizes, in bytes.
//   - The L1 size is assumed to be per core.
//   - The L2 size is assumed to be shared among all cores. What we call
//     'L2' here is effectively the top-level cache.
//
// On x86 these should ideally be queried at runtime. On ARM, the instruction
// that reports them is privileged and Android kernels do not expose it to
// userspace. Fortunately, the majority of ARM devices have roughly
// comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The generic ARM/Android defaults below are equal to or somewhat lower than
// those, and were found to perform well on both the Nexus 5 and Android One.
// These values are in principle too low for typical x86 CPUs, where the L2
// value should be at least (L3 cache size / number of cores).
//
#if defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// ARM on Apple platforms: iPhone/iPad.
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#else
// Any other ARM or ARM-like hardware (Android implies ARM-like), so tuning
// for ARM is OK here; on x86 Atom, though, querying the cache sizes at
// runtime might be possible and would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#endif
#elif defined(GEMMLOWP_X86_64)
// x86-64 that is not Android, hence likely desktop-class hardware. Larger
// caches are assumed, though they really should be queried at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 that is not Android. Same idea as x86-64 but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS that is not Android. TODO: MIPS and Android?
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware: maybe something unusual, older, or embedded.
// Smaller caches are assumed, without departing too far from the
// ARM/Android values, to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif
224 
// Fraction of the L2 cache that we intend to spend on RHS blocks. It should
// lie between 0 and 1, and is typically close to 1, because we generally
// want most of the L2 cache to hold one large RHS block.
#if !defined(GEMMLOWP_X86)
const float kDefaultL2RhsFactor = 0.75f;
#else
// For IA, use the entire L2 cache for the RHS matrix. The LHS matrix is not
// blocked for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#endif
236 
// Size of a SIMD register, in bytes. It determines the dimensions of
// PackingRegisterBlock, so that such blocks can be loaded into registers
// efficiently and the packing code can stay within registers as much as
// possible.
// For the non-SIMD generic fallback code this is merely an array size, so
// any value would work there. Platforms may choose different values, but
// each must keep its own optimized packing paths consistent with the value
// it picks.
const int kRegisterSize = 16;
246 
// Asks the CPU to prefetch the cache line that contains ptr. The pointer is
// never dereferenced by this function itself; on toolchains with no known
// prefetch primitive the call does nothing.
inline void Prefetch(const void* ptr) {
#if defined(GEMMLOWP_ARM_64) && defined(GEMMLOWP_ALLOW_INLINE_ASM)
  // AArch64 offers very fine-grained prefetch instructions that compilers
  // can't know how to map __builtin_prefetch onto, so they don't, which
  // leaves __builtin_prefetch a no-op on this architecture. We therefore
  // issue the instruction directly. "pldl1keep" is usually the variant we
  // want: prefetch for load, into L1 cache, using each value multiple times.
  asm volatile("prfm pldl1keep, [%[ptr]]\n" : : [ptr] "r"(ptr) :);
#elif defined(__GNUC__)
  // Clang and GCC both define __GNUC__ and both provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  // Nothing to do; just keep the parameter from looking unused.
  static_cast<void>(ptr);
#endif
}
263 
// Rounds the runtime value i down to a multiple of the compile-time constant
// Modulus, by subtracting from i its remainder modulo Modulus, and returns
// the result.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  i -= i % Modulus;
  return i;
}
270 
271 // Returns the runtime argument rounded up to the nearest multiple of
272 // the fixed Modulus.
273 template <unsigned Modulus, typename Integer>
RoundUp(Integer i)274 Integer RoundUp(Integer i) {
275   return RoundDown<Modulus>(i + Modulus - 1);
276 }
277 
// Computes a / b rounded up ('ceil') to an integer, by adding b - 1 to the
// numerator before performing the truncating integer division.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  // 'auto' preserves any integer promotion applied to the sum.
  const auto padded_numerator = a + b - 1;
  return padded_numerator / b;
}
283 
// Returns the argument rounded up to the nearest power of two; a value that
// already is a power of two is returned unchanged.
//
// Works by setting every bit below the highest set bit of n - 1 and then
// adding 1. The sequence of shifts is derived from the width of Integer, so
// that 64-bit arguments are handled correctly as well; a fixed sequence
// ending at a shift of 16 only covers types up to 32 bits wide.
//
// Expects n >= 1, and the rounded-up result must be representable in
// Integer. For n == 0, n - 1 has all bits set and the result wraps to 0.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  // Shift by 1, 2, 4, ... while the shift is below the bit width of Integer
  // (8 bits per byte assumed). For a 32-bit Integer this is exactly
  // 1, 2, 4, 8, 16.
  for (unsigned shift = 1; shift < 8 * sizeof(Integer); shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}
295 
// Compile-time predicate: IsPowerOfTwo<N>::value is true exactly when N is a
// positive power of two (1, 2, 4, 8, ...).
template <int N>
struct IsPowerOfTwo {
  // A power of two has a single bit set, so clearing its lowest set bit with
  // N & (N - 1) leaves zero. The N > 0 guard rejects zero, which would
  // otherwise pass the bit test, as well as negative values; it also
  // short-circuits before N - 1 is evaluated for the most negative int.
  static const bool value = (N > 0) && !(N & (N - 1));
};
300 
// Reports the `size` elements of type T starting at ptr as initialized
// memory, through whatever GEMMLOWP_MARK_MEMORY_AS_INITIALIZED expands to.
// When that macro is not defined, the call does nothing.
template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#if !defined(GEMMLOWP_MARK_MEMORY_AS_INITIALIZED)
  // No hook available: only keep the parameters from looking unused.
  static_cast<void>(ptr);
  static_cast<void>(size);
#else
  // The hook takes a byte count, hence the multiplication by sizeof(T).
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
                                      size * sizeof(T));
#endif
}
311 
312 }  // namespace gemmlowp
313 
314 #endif  // GEMMLOWP_INTERNAL_COMMON_H_
315