1 // Copyright 2015 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // common.h: contains stuff that's used throughout gemmlowp
16 // and should always be available.
17 
18 #ifndef GEMMLOWP_INTERNAL_COMMON_H_
19 #define GEMMLOWP_INTERNAL_COMMON_H_
20 
21 #include <pthread.h>
22 
23 #include <algorithm>
24 #include <cassert>
25 #include <cmath>
26 #include <cstdlib>
27 
28 #include "../profiling/instrumentation.h"
29 
// GEMMLOWP_ALLOW_INLINE_ASM: defined when inline assembly paths may be used.
// Our inline assembly paths assume GCC/Clang syntax, so this requires
// __GNUC__ to be defined.
// Native Client doesn't seem to support inline assembly(?), so it is
// excluded explicitly.
#if defined(__GNUC__) && !defined(__native_client__)
#define GEMMLOWP_ALLOW_INLINE_ASM
#endif
35 
// GEMMLOWP_NOINLINE: function attribute macro that prevents inlining.
// When __GNUC__ is defined it expands to __attribute__((noinline)).
// For non-GCC compilers it is defined as an empty macro, so annotated
// functions compile unchanged but receive no noinline guarantee.
#if defined(__GNUC__)
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#else
#define GEMMLOWP_NOINLINE
#endif
43 
// Detect ARM, 32-bit or 64-bit.
// GEMMLOWP_ARM_32 is defined when the compiler defines __arm__.
#ifdef __arm__
#define GEMMLOWP_ARM_32
#endif

// GEMMLOWP_ARM_64 is defined when the compiler defines __aarch64__.
#ifdef __aarch64__
#define GEMMLOWP_ARM_64
#endif

// GEMMLOWP_ARM is defined for either ARM flavor; use it for code that does
// not care about the 32-bit / 64-bit distinction.
#if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_ARM
#endif
56 
// Detect x86, 32-bit or 64-bit.
// GEMMLOWP_X86_32 is defined when any of the 32-bit x86 tokens checked
// below is defined.
#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
#define GEMMLOWP_X86_32
#endif

// GEMMLOWP_X86_64 is defined when any of the 64-bit x86 tokens checked
// below is defined.
#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define GEMMLOWP_X86_64
#endif

// GEMMLOWP_X86 is defined for either x86 flavor.
#if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64)
#define GEMMLOWP_X86
#endif
69 
// Some of our optimized paths use inline assembly, and for now we don't
// bother enabling the other optimized paths that use intrinsics where we
// can't also use the inline assembly paths. Consequently none of the SIMD
// tokens below (GEMMLOWP_NEON*, GEMMLOWP_SSE4*) are defined unless
// GEMMLOWP_ALLOW_INLINE_ASM is defined.
#ifdef GEMMLOWP_ALLOW_INLINE_ASM

// Detect NEON. It's important to check for both tokens.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define GEMMLOWP_NEON
#endif

// Convenience NEON tokens for 32-bit or 64-bit:
// GEMMLOWP_NEON_32 means NEON on 32-bit ARM.
#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_NEON_32
#endif

// GEMMLOWP_NEON_64 means NEON on 64-bit ARM.
#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_NEON_64
#endif

// Detect SSE4. The token checked is __SSE4_1__.
#if defined __SSE4_1__
#define GEMMLOWP_SSE4
#endif

// Convenience SSE4 tokens for 32-bit or 64-bit:
// GEMMLOWP_SSE4_32 means SSE4 on 32-bit x86.
#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32)
#define GEMMLOWP_SSE4_32
#endif

// GEMMLOWP_SSE4_64 means SSE4 on 64-bit x86.
#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64)
#define GEMMLOWP_SSE4_64
#endif

#endif  // GEMMLOWP_ALLOW_INLINE_ASM
104 
// Detect Android: GEMMLOWP_ANDROID is defined when either __ANDROID__ or
// ANDROID is defined. Don't conflate with ARM - we care about tuning
// for non-ARM Android devices too. This can be used in conjunction
// with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs.
#if defined(__ANDROID__) || defined(ANDROID)
#define GEMMLOWP_ANDROID
#endif
111 
112 namespace gemmlowp {
113 
// Standard cache line size, in bytes. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime; however,
// 64-byte cache lines are the vast majority, and even if this is
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;
120 
// Default L1 and L2 data cache sizes, in bytes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively the top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs
// where we should set the L2 value to (L3 cache size / number of cores) at
// least.
//
// Exactly one of the four branches below is compiled, so both constants are
// always defined exactly once.
#if defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// ARM or ARM-like hardware (Android implies ARM-like) so here it's OK
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
// Values: L1 = 16 KiB, L2 = 384 KiB.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
// Values: L1 = 32 KiB, L2 = 4 MiB.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
// Values: L1 = 32 KiB, L2 = 2 MiB.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
// Values: L1 = 16 KiB, L2 = 256 KiB.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif
160 
// The proportion of the L2 cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For x86 (IA), use the entire L2 cache for the RHS matrix. The LHS matrix
// is not blocked for the L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
// Everywhere else, reserve three quarters of the L2 cache for the RHS block.
const float kDefaultL2RhsFactor = 0.75f;
#endif
172 
// The number of bytes in a SIMD register (16 bytes, i.e. 128 bits). This is
// used to determine the dimensions of PackingRegisterBlock so that such
// blocks can be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
const int kRegisterSize = 16;
182 
// Requantization to less-than-8-bit is costly, so it is only worth
// doing if the GEMM width is large enough. This constant is that
// minimum width threshold.
const int kMinimumWidthForRequantization = 100;
186 
// Hints the CPU to prefetch the cache line containing ptr.
// On compilers that do not define __GNUC__ this does nothing.
inline void Prefetch(const void* ptr) {
#if !defined(__GNUC__)
  // No prefetch builtin to call here: just mark the argument as used.
  static_cast<void>(ptr);
#else
  // Clang and GCC both define __GNUC__ and provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#endif
}
195 
// Rounds the runtime argument down to a multiple of the compile-time
// Modulus by stripping off the remainder of i modulo Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  // Compound assignment keeps the same mixed Integer/unsigned arithmetic and
  // the same final conversion back to Integer as "i - (i % Modulus)".
  i -= i % Modulus;
  return i;
}
202 
203 // Returns the runtime argument rounded up to the nearest multiple of
204 // the fixed Modulus.
205 template <unsigned Modulus, typename Integer>
RoundUp(Integer i)206 Integer RoundUp(Integer i) {
207   return RoundDown<Modulus>(i + Modulus - 1);
208 }
209 
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
//
// The result is built from the truncating quotient and the remainder rather
// than from (a + b - 1) / b, so no intermediate sum can overflow Integer for
// large a, and operands of either sign are rounded toward +infinity.
// b must be nonzero.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  const Integer quotient = a / b;
  const Integer remainder = a % b;
  // Integer division truncates toward zero, which already is the ceiling when
  // the division is exact or the true quotient is negative. A nonzero
  // remainder carries the sign of a, so "remainder and b have the same sign"
  // means the true quotient is positive and was truncated downward.
  if (remainder != 0 && ((remainder > 0) == (b > 0))) {
    return quotient + 1;
  }
  return quotient;
}
215 
// Returns the argument rounded up to the nearest power of two. A value that
// is already a power of two is returned unchanged.
//
// Works by copying the highest set bit of (n - 1) into every lower bit
// position and then adding one. The shift distance doubles until it covers
// the full width of Integer, so 64-bit types are handled as well as
// narrower ones.
// NOTE(review): behavior for n <= 0 is not specified here - confirm that
// callers only pass n >= 1.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  // sizeof counts bytes; 8 bits per byte is assumed.
  const unsigned kBitWidth = static_cast<unsigned>(sizeof(Integer) * 8);
  Integer i = n - 1;
  for (unsigned shift = 1; shift < kBitWidth; shift <<= 1) {
    i |= i >> shift;
  }
  return i + 1;
}
227 
// Compile-time predicate: IsPowerOfTwo<N>::value is true exactly when N is a
// positive power of two (1, 2, 4, 8, ...).
//
// N & (N - 1) clears the lowest set bit of N, so it is zero when N has at
// most one bit set. The N > 0 guard rejects zero (no bit set) and negative
// values; because && short-circuits, N - 1 is never evaluated for them.
template <int N>
struct IsPowerOfTwo {
  static const bool value = (N > 0) && !(N & (N - 1));
};
232 
233 }  // namespace gemmlowp
234 
235 #endif  // GEMMLOWP_INTERNAL_COMMON_H_
236