1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // common.h: contains stuff that's used throughout gemmlowp
16 // and should always be available.
17 
18 #ifndef GEMMLOWP_INTERNAL_COMMON_H_
19 #define GEMMLOWP_INTERNAL_COMMON_H_
20 
21 #include "../internal/platform.h"
22 #include "../profiling/pthread_everywhere.h"
23 
24 #include <algorithm>
25 #include <cassert>
26 #include <cmath>
27 #include <cstdlib>
28 
29 #include "../internal/detect_platform.h"
30 #include "../profiling/instrumentation.h"
31 
32 namespace gemmlowp {
33 
// Standard cache line size, in bytes. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime, however
// 64 byte cache lines are the vast majority, and even if it's
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;
40 
// Default L1 and L2 data cache sizes, in bytes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs
// where we should set the L2 value to (L3 cache size / number of cores) at
// least.
//
// NOTE(review): these defaults presumably only feed blocking heuristics, so
// an under-estimate costs performance rather than correctness — confirm at
// the use sites.
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// iPhone/iPad
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Other ARM or ARM-like hardware (Android implies ARM-like) so here it's OK
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS and not Android. TODO: MIPS and Android?
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif
89 
// The proportion of the cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For IA, use the entire L2 cache for the RHS matrix. LHS matrix is not blocked
// for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
// Elsewhere, keep a 25% margin of the L2 for data other than the RHS block.
const float kDefaultL2RhsFactor = 0.75f;
#endif
101 
// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.

#ifdef GEMMLOWP_AVX2
// AVX2: 256-bit (32-byte) YMM registers.
const int kRegisterSize = 32;
#else
// All other targets: 128-bit (16-byte) registers (e.g. NEON, SSE), which is
// also the generic array size used by the non-SIMD fallback code.
const int kRegisterSize = 16;
#endif
116 
117 // Hints the CPU to prefetch the cache line containing ptr.
Prefetch(const void * ptr)118 inline void Prefetch(const void* ptr) {
119 #if defined GEMMLOWP_ARM_64 && defined GEMMLOWP_ALLOW_INLINE_ASM
120   // Aarch64 has very detailed prefetch instructions, that compilers
121   // can't know how to map __builtin_prefetch to, and as a result, don't,
122   // leaving __builtin_prefetch a no-op on this architecture.
123   // For our purposes, "pldl1keep" is usually what we want, meaning:
124   // "prefetch for load, into L1 cache, using each value multiple times".
125   asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
126 #elif defined \
127     __GNUC__  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
128   __builtin_prefetch(ptr);
129 #else
130   (void)ptr;
131 #endif
132 }
133 
// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  // Truncating integer division strips the remainder, so
  // (i / Modulus) * Modulus == i - (i % Modulus) under the same
  // arithmetic conversions as the subtraction form.
  const Integer quotient = i / Modulus;
  return quotient * Modulus;
}
140 
// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  // Bias i up to the largest value sharing the same rounded-down multiple,
  // then strip the remainder; equivalent to RoundDown<Modulus>(i + Modulus - 1).
  const Integer biased = i + Modulus - 1;
  return biased - (biased % Modulus);
}
147 
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  // Adding b - 1 to the numerator makes truncating division round upward.
  const Integer biased_numerator = a + b - 1;
  return biased_numerator / b;
}
153 
// Returns the argument rounded up to the nearest power of two.
// Assumes n >= 1 (n itself is returned when it is already a power of two;
// for n == 0 the bit-smearing below yields 0).
//
// Classic bit trick: smear the highest set bit of n - 1 into every lower
// position, so the result has all bits below (and including) that bit set;
// adding 1 then produces the next power of two.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  // Smear across the full width of Integer. The original hand-unrolled
  // version stopped at >> 16, which is only correct for values fitting in
  // 32 bits; this loop also handles 64-bit Integer types. The trip count is
  // a compile-time constant, so compilers can fully unroll it.
  for (unsigned shift = 1; shift < 8 * sizeof(Integer); shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}
165 
// Compile-time predicate: value is true iff N is a power of two.
// (As with the usual bit trick, N == 0 also reports true.)
template <int N>
struct IsPowerOfTwo {
  // A power of two has exactly one bit set, so clearing its lowest set bit
  // via N & (N - 1) must leave zero.
  static constexpr bool value = ((N & (N - 1)) == 0);
};
170 
// If the project defines GEMMLOWP_MARK_MEMORY_AS_INITIALIZED (presumably a
// hook for a memory-checking tool — confirm at its definition site), forwards
// the buffer [ptr, ptr + size) to that macro so the tool treats it as
// initialized. Otherwise this is a no-op.
// Note: size is an element count, not a byte count — hence the
// size * sizeof(T) conversion below.
template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
                                      size * sizeof(T));
#else
  // No tool hooked in: just silence unused-parameter warnings.
  (void)ptr;
  (void)size;
#endif
}
181 
182 }  // namespace gemmlowp
183 
184 #endif  // GEMMLOWP_INTERNAL_COMMON_H_
185