1 // Copyright 2015 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // instrumentation.h: contains the definitions needed to
16 // instrument code for profiling:
17 //   ScopedProfilingLabel, RegisterCurrentThreadForProfiling.
18 //
19 // profiler.h is only needed to drive the profiler:
20 //   StartProfiling, FinishProfiling.
21 //
22 // See the usage example in profiler.h.
23 
24 #ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_
25 #define GEMMLOWP_PROFILING_INSTRUMENTATION_H_
26 
27 #include <pthread.h>
28 #include <cstdio>
29 
30 #ifndef GEMMLOWP_USE_STLPORT
31 #include <cstdint>
32 #else
33 #include <stdint.h>
34 namespace std {
35 using ::uint8_t;
36 using ::uint16_t;
37 using ::uint32_t;
38 using ::int8_t;
39 using ::int16_t;
40 using ::int32_t;
41 using ::size_t;
42 using ::uintptr_t;
43 }
44 #endif
45 
46 #include <algorithm>
47 #include <cassert>
48 #include <cstdlib>
49 
50 #ifdef GEMMLOWP_PROFILING
51 #include <cstring>
52 #include <set>
53 #endif
54 
55 // We should always use C++11 thread_local; unfortunately that
56 // isn't fully supported on Apple yet.
57 #ifdef __APPLE__
58 #define GEMMLOWP_THREAD_LOCAL static __thread
59 #define GEMMLOWP_USING_OLD_THREAD_LOCAL
60 #else
61 #define GEMMLOWP_THREAD_LOCAL thread_local
62 #endif
63 
64 namespace gemmlowp {
65 
// An assertion that stays active in every build type (it does not look at
// NDEBUG). If |condition| holds this does nothing. Otherwise it prints |msg|
// to stderr, prefixed with "gemmlowp error: ", and terminates the process
// with abort().
inline void ReleaseBuildAssertion(bool condition, const char* msg) {
  if (condition) {
    return;
  }
  fprintf(stderr, "gemmlowp error: %s\n", msg);
  abort();
}
72 
73 // To be used as template parameter for GlobalLock.
74 // GlobalLock<ProfilerLockId> is the profiler global lock:
75 // registering threads, starting profiling, finishing profiling, and
76 // the profiler itself as it samples threads, all need to lock it.
77 struct ProfilerLockId;
78 
// A minimal global lock built on a pthread mutex. The LockId template
// parameter is only a tag: every distinct LockId type instantiates its own
// mutex, so we can have several independent global locks.
template <typename LockId>
class GlobalLock {
 public:
  // Acquires the mutex belonging to this LockId.
  static void Lock() { pthread_mutex_lock(Instance()); }

  // Releases the mutex belonging to this LockId.
  static void Unlock() { pthread_mutex_unlock(Instance()); }

 private:
  // Returns the single mutex shared by all users of GlobalLock<LockId>.
  // It is a function-local static with a static initializer.
  static pthread_mutex_t* Instance() {
    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    return &mutex;
  }
};
92 
93 // A very simple RAII helper to lock and unlock a GlobalLock
94 template <typename LockId>
95 struct AutoGlobalLock {
AutoGlobalLockAutoGlobalLock96   AutoGlobalLock() { GlobalLock<LockId>::Lock(); }
~AutoGlobalLockAutoGlobalLock97   ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); }
98 };
99 
// MemoryBarrier is a compiler-only barrier: the asm statement below is empty,
// so it emits no instruction. It tells the compiler two things:
//   1) 'volatile' marks the statement as having side effects, so it must not
//     be discarded.
//   2) The "memory" clobber says that any memory may be read or written
//     here. The compiler therefore may not move memory accesses across this
//     point and must assume that any value previously read from memory may
//     have changed. This is an alternative to using 'volatile' variables.
inline void MemoryBarrier() {
  __asm__ __volatile__("" : : : "memory");
}
108 
109 // Profiling definitions. Two paths: when profiling is enabled,
110 // and when profiling is disabled.
111 #ifdef GEMMLOWP_PROFILING
112 // This code path is when profiling is enabled.
113 
114 // A pseudo-call-stack. Contrary to a real call-stack, this only
115 // contains pointers to literal strings that were manually entered
116 // in the instrumented code (see ScopedProfilingLabel).
117 struct ProfilingStack {
118   static const std::size_t kMaxSize = 15;
119   typedef const char* LabelsArrayType[kMaxSize];
120   LabelsArrayType labels;
121   std::size_t size;
122 
ProfilingStackProfilingStack123   ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); }
124 
PushProfilingStack125   void Push(const char* label) {
126     MemoryBarrier();
127     ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
128     labels[size] = label;
129     MemoryBarrier();
130     size++;
131     MemoryBarrier();
132   }
133 
PopProfilingStack134   void Pop() {
135     MemoryBarrier();
136     ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
137     size--;
138     MemoryBarrier();
139   }
140 
UpdateTopProfilingStack141   void UpdateTop(const char* new_label) {
142     MemoryBarrier();
143     assert(size);
144     labels[size - 1] = new_label;
145     MemoryBarrier();
146   }
147 
148   ProfilingStack& operator=(const ProfilingStack& other) {
149     memcpy(this, &other, sizeof(ProfilingStack));
150     return *this;
151   }
152 
153   bool operator==(const ProfilingStack& other) const {
154     return !memcmp(this, &other, sizeof(ProfilingStack));
155   }
156 };
157 
158 static_assert(
159     !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
160     "ProfilingStack should have power-of-two size to fit in cache lines");
161 
struct ThreadInfo;

// Accessor for the global set of threads being profiled. The set is a
// function-local static, so it is created on first use and every call
// returns a reference to the same object. This function takes no lock itself.
inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
  typedef std::set<ThreadInfo*> ThreadSet;
  static ThreadSet registry;
  return registry;
}
169 
170 struct ThreadInfo {
171   pthread_key_t key;  // used only to get a callback at thread exit.
172   ProfilingStack stack;
173 
ThreadInfoThreadInfo174   ThreadInfo() {
175     pthread_key_create(&key, ThreadExitCallback);
176     pthread_setspecific(key, this);
177   }
178 
ThreadExitCallbackThreadInfo179   static void ThreadExitCallback(void* ptr) {
180     AutoGlobalLock<ProfilerLockId> lock;
181     ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
182     ThreadsUnderProfiling().erase(self);
183     pthread_key_delete(self->key);
184   }
185 };
186 
ThreadLocalThreadInfo()187 inline ThreadInfo& ThreadLocalThreadInfo() {
188 #ifdef GEMMLOWP_USING_OLD_THREAD_LOCAL
189   // We're leaking this ThreadInfo structure, because Apple doesn't support
190   // non-trivial constructors or destructors for their __thread type modifier.
191   GEMMLOWP_THREAD_LOCAL ThreadInfo* i = nullptr;
192   if (i == nullptr) {
193     i = new ThreadInfo();
194   }
195   return *i;
196 #else
197   GEMMLOWP_THREAD_LOCAL ThreadInfo i;
198   return i;
199 #endif
200 }
201 
202 // ScopedProfilingLabel is how one instruments code for profiling
203 // with this profiler. Construct local ScopedProfilingLabel variables,
204 // passing a literal string describing the local code. Profile
205 // samples will then be annotated with this label, while it is in scope
206 // (whence the name --- also known as RAII).
207 // See the example in profiler.h.
208 class ScopedProfilingLabel {
209   ProfilingStack* profiling_stack_;
210 
211  public:
ScopedProfilingLabel(const char * label)212   explicit ScopedProfilingLabel(const char* label)
213       : profiling_stack_(&ThreadLocalThreadInfo().stack) {
214     profiling_stack_->Push(label);
215   }
216 
~ScopedProfilingLabel()217   ~ScopedProfilingLabel() { profiling_stack_->Pop(); }
218 
Update(const char * new_label)219   void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); }
220 };
221 
222 // To be called once on each thread to be profiled.
RegisterCurrentThreadForProfiling()223 inline void RegisterCurrentThreadForProfiling() {
224   AutoGlobalLock<ProfilerLockId> lock;
225   ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
226 }
227 
228 #else  // not GEMMLOWP_PROFILING
229 // This code path is when profiling is disabled.
230 
// No-op stand-in for ScopedProfilingLabel. It has no data members and
// both member functions have empty bodies, which ensures zero runtime
// overhead when profiling is disabled.
struct ScopedProfilingLabel {
  explicit ScopedProfilingLabel(const char* /*label*/) {}

  void Update(const char* /*new_label*/) {}
};
237 
// With profiling disabled there is nothing to register: this is a no-op.
inline void RegisterCurrentThreadForProfiling() {
}
239 
240 #endif
241 
242 }  // end namespace gemmlowp
243 
244 #endif  // GEMMLOWP_PROFILING_INSTRUMENTATION_H_
245