1 //===-- tsan_clock.cc -----------------------------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file is a part of ThreadSanitizer (TSan), a race detector.
11 //
12 //===----------------------------------------------------------------------===//
13 #include "tsan_clock.h"
14 #include "tsan_rtl.h"
15 #include "sanitizer_common/sanitizer_placement_new.h"
16 
17 // SyncClock and ThreadClock implement vector clocks for sync variables
18 // (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
19 // ThreadClock contains fixed-size vector clock for maximum number of threads.
20 // SyncClock contains growable vector clock for currently necessary number of
21 // threads.
22 // Together they implement very simple model of operations, namely:
23 //
24 //   void ThreadClock::acquire(const SyncClock *src) {
25 //     for (int i = 0; i < kMaxThreads; i++)
26 //       clock[i] = max(clock[i], src->clock[i]);
27 //   }
28 //
29 //   void ThreadClock::release(SyncClock *dst) const {
30 //     for (int i = 0; i < kMaxThreads; i++)
31 //       dst->clock[i] = max(dst->clock[i], clock[i]);
32 //   }
33 //
34 //   void ThreadClock::ReleaseStore(SyncClock *dst) const {
35 //     for (int i = 0; i < kMaxThreads; i++)
36 //       dst->clock[i] = clock[i];
37 //   }
38 //
39 //   void ThreadClock::acq_rel(SyncClock *dst) {
40 //     acquire(dst);
41 //     release(dst);
42 //   }
43 //
44 // Conformance to this model is extensively verified in tsan_clock_test.cc.
45 // However, the implementation is significantly more complex. The complexity
46 // allows to implement important classes of use cases in O(1) instead of O(N).
47 //
48 // The use cases are:
49 // 1. Singleton/once atomic that has a single release-store operation followed
50 //    by zillions of acquire-loads (the acquire-load is O(1)).
51 // 2. Thread-local mutex (both lock and unlock can be O(1)).
52 // 3. Leaf mutex (unlock is O(1)).
53 // 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
54 // 5. An atomic with a single writer (writes can be O(1)).
55 // The implementation dynamically adopts to workload. So if an atomic is in
56 // read-only phase, these reads will be O(1); if it later switches to read/write
57 // phase, the implementation will correctly handle that by switching to O(N).
58 //
59 // Thread-safety note: all const operations on SyncClock's are conducted under
60 // a shared lock; all non-const operations on SyncClock's are conducted under
61 // an exclusive lock; ThreadClock's are private to respective threads and so
62 // do not need any protection.
63 //
64 // Description of ThreadClock state:
65 // clk_ - fixed size vector clock.
66 // nclk_ - effective size of the vector clock (the rest is zeros).
67 // tid_ - index of the thread associated with he clock ("current thread").
68 // last_acquire_ - current thread time when it acquired something from
69 //   other threads.
70 //
71 // Description of SyncClock state:
72 // clk_ - variable size vector clock, low kClkBits hold timestamp,
73 //   the remaining bits hold "acquired" flag (the actual value is thread's
74 //   reused counter);
75 //   if acquried == thr->reused_, then the respective thread has already
76 //   acquired this clock (except possibly dirty_tids_).
77 // dirty_tids_ - holds up to two indeces in the vector clock that other threads
78 //   need to acquire regardless of "acquired" flag value;
79 // release_store_tid_ - denotes that the clock state is a result of
80 //   release-store operation by the thread with release_store_tid_ index.
81 // release_store_reused_ - reuse count of release_store_tid_.
82 
83 // We don't have ThreadState in these methods, so this is an ugly hack that
84 // works only in C++.
85 #ifndef SANITIZER_GO
86 # define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
87 #else
88 # define CPP_STAT_INC(typ) (void)0
89 #endif
90 
91 namespace __tsan {
92 
ThreadClock(unsigned tid,unsigned reused)93 ThreadClock::ThreadClock(unsigned tid, unsigned reused)
94     : tid_(tid)
95     , reused_(reused + 1) {  // 0 has special meaning
96   CHECK_LT(tid, kMaxTidInClock);
97   CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
98   nclk_ = tid_ + 1;
99   last_acquire_ = 0;
100   internal_memset(clk_, 0, sizeof(clk_));
101   clk_[tid_].reused = reused_;
102 }
103 
acquire(ClockCache * c,const SyncClock * src)104 void ThreadClock::acquire(ClockCache *c, const SyncClock *src) {
105   DCHECK_LE(nclk_, kMaxTid);
106   DCHECK_LE(src->size_, kMaxTid);
107   CPP_STAT_INC(StatClockAcquire);
108 
109   // Check if it's empty -> no need to do anything.
110   const uptr nclk = src->size_;
111   if (nclk == 0) {
112     CPP_STAT_INC(StatClockAcquireEmpty);
113     return;
114   }
115 
116   // Check if we've already acquired src after the last release operation on src
117   bool acquired = false;
118   if (nclk > tid_) {
119     CPP_STAT_INC(StatClockAcquireLarge);
120     if (src->elem(tid_).reused == reused_) {
121       CPP_STAT_INC(StatClockAcquireRepeat);
122       for (unsigned i = 0; i < kDirtyTids; i++) {
123         unsigned tid = src->dirty_tids_[i];
124         if (tid != kInvalidTid) {
125           u64 epoch = src->elem(tid).epoch;
126           if (clk_[tid].epoch < epoch) {
127             clk_[tid].epoch = epoch;
128             acquired = true;
129           }
130         }
131       }
132       if (acquired) {
133         CPP_STAT_INC(StatClockAcquiredSomething);
134         last_acquire_ = clk_[tid_].epoch;
135       }
136       return;
137     }
138   }
139 
140   // O(N) acquire.
141   CPP_STAT_INC(StatClockAcquireFull);
142   nclk_ = max(nclk_, nclk);
143   for (uptr i = 0; i < nclk; i++) {
144     u64 epoch = src->elem(i).epoch;
145     if (clk_[i].epoch < epoch) {
146       clk_[i].epoch = epoch;
147       acquired = true;
148     }
149   }
150 
151   // Remember that this thread has acquired this clock.
152   if (nclk > tid_)
153     src->elem(tid_).reused = reused_;
154 
155   if (acquired) {
156     CPP_STAT_INC(StatClockAcquiredSomething);
157     last_acquire_ = clk_[tid_].epoch;
158   }
159 }
160 
release(ClockCache * c,SyncClock * dst) const161 void ThreadClock::release(ClockCache *c, SyncClock *dst) const {
162   DCHECK_LE(nclk_, kMaxTid);
163   DCHECK_LE(dst->size_, kMaxTid);
164 
165   if (dst->size_ == 0) {
166     // ReleaseStore will correctly set release_store_tid_,
167     // which can be important for future operations.
168     ReleaseStore(c, dst);
169     return;
170   }
171 
172   CPP_STAT_INC(StatClockRelease);
173   // Check if we need to resize dst.
174   if (dst->size_ < nclk_)
175     dst->Resize(c, nclk_);
176 
177   // Check if we had not acquired anything from other threads
178   // since the last release on dst. If so, we need to update
179   // only dst->elem(tid_).
180   if (dst->elem(tid_).epoch > last_acquire_) {
181     UpdateCurrentThread(dst);
182     if (dst->release_store_tid_ != tid_ ||
183         dst->release_store_reused_ != reused_)
184       dst->release_store_tid_ = kInvalidTid;
185     return;
186   }
187 
188   // O(N) release.
189   CPP_STAT_INC(StatClockReleaseFull);
190   // First, remember whether we've acquired dst.
191   bool acquired = IsAlreadyAcquired(dst);
192   if (acquired)
193     CPP_STAT_INC(StatClockReleaseAcquired);
194   // Update dst->clk_.
195   for (uptr i = 0; i < nclk_; i++) {
196     ClockElem &ce = dst->elem(i);
197     ce.epoch = max(ce.epoch, clk_[i].epoch);
198     ce.reused = 0;
199   }
200   // Clear 'acquired' flag in the remaining elements.
201   if (nclk_ < dst->size_)
202     CPP_STAT_INC(StatClockReleaseClearTail);
203   for (uptr i = nclk_; i < dst->size_; i++)
204     dst->elem(i).reused = 0;
205   for (unsigned i = 0; i < kDirtyTids; i++)
206     dst->dirty_tids_[i] = kInvalidTid;
207   dst->release_store_tid_ = kInvalidTid;
208   dst->release_store_reused_ = 0;
209   // If we've acquired dst, remember this fact,
210   // so that we don't need to acquire it on next acquire.
211   if (acquired)
212     dst->elem(tid_).reused = reused_;
213 }
214 
ReleaseStore(ClockCache * c,SyncClock * dst) const215 void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) const {
216   DCHECK_LE(nclk_, kMaxTid);
217   DCHECK_LE(dst->size_, kMaxTid);
218   CPP_STAT_INC(StatClockStore);
219 
220   // Check if we need to resize dst.
221   if (dst->size_ < nclk_)
222     dst->Resize(c, nclk_);
223 
224   if (dst->release_store_tid_ == tid_ &&
225       dst->release_store_reused_ == reused_ &&
226       dst->elem(tid_).epoch > last_acquire_) {
227     CPP_STAT_INC(StatClockStoreFast);
228     UpdateCurrentThread(dst);
229     return;
230   }
231 
232   // O(N) release-store.
233   CPP_STAT_INC(StatClockStoreFull);
234   for (uptr i = 0; i < nclk_; i++) {
235     ClockElem &ce = dst->elem(i);
236     ce.epoch = clk_[i].epoch;
237     ce.reused = 0;
238   }
239   // Clear the tail of dst->clk_.
240   if (nclk_ < dst->size_) {
241     for (uptr i = nclk_; i < dst->size_; i++) {
242       ClockElem &ce = dst->elem(i);
243       ce.epoch = 0;
244       ce.reused = 0;
245     }
246     CPP_STAT_INC(StatClockStoreTail);
247   }
248   for (unsigned i = 0; i < kDirtyTids; i++)
249     dst->dirty_tids_[i] = kInvalidTid;
250   dst->release_store_tid_ = tid_;
251   dst->release_store_reused_ = reused_;
252   // Rememeber that we don't need to acquire it in future.
253   dst->elem(tid_).reused = reused_;
254 }
255 
acq_rel(ClockCache * c,SyncClock * dst)256 void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
257   CPP_STAT_INC(StatClockAcquireRelease);
258   acquire(c, dst);
259   ReleaseStore(c, dst);
260 }
261 
262 // Updates only single element related to the current thread in dst->clk_.
UpdateCurrentThread(SyncClock * dst) const263 void ThreadClock::UpdateCurrentThread(SyncClock *dst) const {
264   // Update the threads time, but preserve 'acquired' flag.
265   dst->elem(tid_).epoch = clk_[tid_].epoch;
266 
267   for (unsigned i = 0; i < kDirtyTids; i++) {
268     if (dst->dirty_tids_[i] == tid_) {
269       CPP_STAT_INC(StatClockReleaseFast1);
270       return;
271     }
272     if (dst->dirty_tids_[i] == kInvalidTid) {
273       CPP_STAT_INC(StatClockReleaseFast2);
274       dst->dirty_tids_[i] = tid_;
275       return;
276     }
277   }
278   // Reset all 'acquired' flags, O(N).
279   CPP_STAT_INC(StatClockReleaseSlow);
280   for (uptr i = 0; i < dst->size_; i++)
281     dst->elem(i).reused = 0;
282   for (unsigned i = 0; i < kDirtyTids; i++)
283     dst->dirty_tids_[i] = kInvalidTid;
284 }
285 
286 // Checks whether the current threads has already acquired src.
IsAlreadyAcquired(const SyncClock * src) const287 bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
288   if (src->elem(tid_).reused != reused_)
289     return false;
290   for (unsigned i = 0; i < kDirtyTids; i++) {
291     unsigned tid = src->dirty_tids_[i];
292     if (tid != kInvalidTid) {
293       if (clk_[tid].epoch < src->elem(tid).epoch)
294         return false;
295     }
296   }
297   return true;
298 }
299 
Resize(ClockCache * c,uptr nclk)300 void SyncClock::Resize(ClockCache *c, uptr nclk) {
301   CPP_STAT_INC(StatClockReleaseResize);
302   if (RoundUpTo(nclk, ClockBlock::kClockCount) <=
303       RoundUpTo(size_, ClockBlock::kClockCount)) {
304     // Growing within the same block.
305     // Memory is already allocated, just increase the size.
306     size_ = nclk;
307     return;
308   }
309   if (nclk <= ClockBlock::kClockCount) {
310     // Grow from 0 to one-level table.
311     CHECK_EQ(size_, 0);
312     CHECK_EQ(tab_, 0);
313     CHECK_EQ(tab_idx_, 0);
314     size_ = nclk;
315     tab_idx_ = ctx->clock_alloc.Alloc(c);
316     tab_ = ctx->clock_alloc.Map(tab_idx_);
317     internal_memset(tab_, 0, sizeof(*tab_));
318     return;
319   }
320   // Growing two-level table.
321   if (size_ == 0) {
322     // Allocate first level table.
323     tab_idx_ = ctx->clock_alloc.Alloc(c);
324     tab_ = ctx->clock_alloc.Map(tab_idx_);
325     internal_memset(tab_, 0, sizeof(*tab_));
326   } else if (size_ <= ClockBlock::kClockCount) {
327     // Transform one-level table to two-level table.
328     u32 old = tab_idx_;
329     tab_idx_ = ctx->clock_alloc.Alloc(c);
330     tab_ = ctx->clock_alloc.Map(tab_idx_);
331     internal_memset(tab_, 0, sizeof(*tab_));
332     tab_->table[0] = old;
333   }
334   // At this point we have first level table allocated.
335   // Add second level tables as necessary.
336   for (uptr i = RoundUpTo(size_, ClockBlock::kClockCount);
337       i < nclk; i += ClockBlock::kClockCount) {
338     u32 idx = ctx->clock_alloc.Alloc(c);
339     ClockBlock *cb = ctx->clock_alloc.Map(idx);
340     internal_memset(cb, 0, sizeof(*cb));
341     CHECK_EQ(tab_->table[i/ClockBlock::kClockCount], 0);
342     tab_->table[i/ClockBlock::kClockCount] = idx;
343   }
344   size_ = nclk;
345 }
346 
347 // Sets a single element in the vector clock.
348 // This function is called only from weird places like AcquireGlobal.
set(unsigned tid,u64 v)349 void ThreadClock::set(unsigned tid, u64 v) {
350   DCHECK_LT(tid, kMaxTid);
351   DCHECK_GE(v, clk_[tid].epoch);
352   clk_[tid].epoch = v;
353   if (nclk_ <= tid)
354     nclk_ = tid + 1;
355   last_acquire_ = clk_[tid_].epoch;
356 }
357 
DebugDump(int (* printf)(const char * s,...))358 void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
359   printf("clock=[");
360   for (uptr i = 0; i < nclk_; i++)
361     printf("%s%llu", i == 0 ? "" : ",", clk_[i].epoch);
362   printf("] reused=[");
363   for (uptr i = 0; i < nclk_; i++)
364     printf("%s%llu", i == 0 ? "" : ",", clk_[i].reused);
365   printf("] tid=%u/%u last_acq=%llu",
366       tid_, reused_, last_acquire_);
367 }
368 
SyncClock()369 SyncClock::SyncClock()
370     : release_store_tid_(kInvalidTid)
371     , release_store_reused_()
372     , tab_()
373     , tab_idx_()
374     , size_() {
375   for (uptr i = 0; i < kDirtyTids; i++)
376     dirty_tids_[i] = kInvalidTid;
377 }
378 
~SyncClock()379 SyncClock::~SyncClock() {
380   // Reset must be called before dtor.
381   CHECK_EQ(size_, 0);
382   CHECK_EQ(tab_, 0);
383   CHECK_EQ(tab_idx_, 0);
384 }
385 
Reset(ClockCache * c)386 void SyncClock::Reset(ClockCache *c) {
387   if (size_ == 0) {
388     // nothing
389   } else if (size_ <= ClockBlock::kClockCount) {
390     // One-level table.
391     ctx->clock_alloc.Free(c, tab_idx_);
392   } else {
393     // Two-level table.
394     for (uptr i = 0; i < size_; i += ClockBlock::kClockCount)
395       ctx->clock_alloc.Free(c, tab_->table[i / ClockBlock::kClockCount]);
396     ctx->clock_alloc.Free(c, tab_idx_);
397   }
398   tab_ = 0;
399   tab_idx_ = 0;
400   size_ = 0;
401   release_store_tid_ = kInvalidTid;
402   release_store_reused_ = 0;
403   for (uptr i = 0; i < kDirtyTids; i++)
404     dirty_tids_[i] = kInvalidTid;
405 }
406 
elem(unsigned tid) const407 ClockElem &SyncClock::elem(unsigned tid) const {
408   DCHECK_LT(tid, size_);
409   if (size_ <= ClockBlock::kClockCount)
410     return tab_->clock[tid];
411   u32 idx = tab_->table[tid / ClockBlock::kClockCount];
412   ClockBlock *cb = ctx->clock_alloc.Map(idx);
413   return cb->clock[tid % ClockBlock::kClockCount];
414 }
415 
DebugDump(int (* printf)(const char * s,...))416 void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
417   printf("clock=[");
418   for (uptr i = 0; i < size_; i++)
419     printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
420   printf("] reused=[");
421   for (uptr i = 0; i < size_; i++)
422     printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
423   printf("] release_store_tid=%d/%d dirty_tids=%d/%d",
424       release_store_tid_, release_store_reused_,
425       dirty_tids_[0], dirty_tids_[1]);
426 }
427 }  // namespace __tsan
428