1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
17 #define TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
18 
19 #include <array>
20 #include <memory>
21 #include <string>
22 #include <unordered_map>
23 #include <vector>
24 
25 #include "tensorflow/core/common_runtime/allocator_retry.h"
26 #include "tensorflow/core/common_runtime/shared_counter.h"
27 #include "tensorflow/core/framework/allocator.h"
28 #include "tensorflow/core/lib/gtl/stl_util.h"
29 #include "tensorflow/core/lib/strings/strcat.h"
30 #include "tensorflow/core/platform/macros.h"
31 #include "tensorflow/core/platform/mutex.h"
32 #include "tensorflow/core/platform/thread_annotations.h"
33 #include "tensorflow/core/platform/types.h"
34 #include "tensorflow/core/protobuf/config.pb.h"
35 
36 namespace tensorflow {
37 
38 // A memory allocator that implements a 'best-fit with coalescing'
39 // algorithm.  This is essentially a very simple version of Doug Lea's
40 // malloc (dlmalloc).
41 //
42 // The goal of this allocator is to support defragmentation via
43 // coalescing.  One assumption we make is that the process using this
44 // allocator owns pretty much all of the memory, and that nearly
45 // all requests to allocate memory go through this interface.
46 class BFCAllocator : public Allocator {
47  public:
48   // Takes ownership of sub_allocator.
49   BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
50                bool allow_growth, const string& name);
51   ~BFCAllocator() override;
52 
Name()53   string Name() override { return name_; }
54 
AllocateRaw(size_t alignment,size_t num_bytes)55   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
56     return AllocateRaw(alignment, num_bytes, AllocationAttributes());
57   }
58 
59   void* AllocateRaw(size_t alignment, size_t num_bytes,
60                     const AllocationAttributes& allocation_attr) override;
61 
62   void DeallocateRaw(void* ptr) override;
63 
64   bool TracksAllocationSizes() override;
65 
66   size_t RequestedSize(const void* ptr) override;
67 
68   size_t AllocatedSize(const void* ptr) override;
69 
70   int64 AllocationId(const void* ptr) override;
71 
72   absl::optional<AllocatorStats> GetStats() override;
73 
74   void ClearStats() override;
75 
SetTimingCounter(SharedCounter * sc)76   void SetTimingCounter(SharedCounter* sc) { timing_counter_ = sc; }
77 
78  private:
79   struct Bin;
80 
81   void* AllocateRawInternal(size_t alignment, size_t num_bytes,
82                             bool dump_log_on_failure,
83                             uint64 freed_before_count);
84 
85   void* AllocateRawInternalWithRetry(
86       size_t alignment, size_t num_bytes,
87       const AllocationAttributes& allocation_attr);
88 
89   void DeallocateRawInternal(void* ptr);
90 
91   // A ChunkHandle is an index into the chunks_ vector in BFCAllocator
92   // kInvalidChunkHandle means an invalid chunk
93   typedef size_t ChunkHandle;
94   static const int kInvalidChunkHandle = -1;
95 
96   typedef int BinNum;
97   static const int kInvalidBinNum = -1;
98   static const int kNumBins = 21;
99 
100   // A Chunk points to a piece of memory that's either entirely free or entirely
101   // in use by one user memory allocation.
102   //
103   // An AllocationRegion's memory is split up into one or more disjoint Chunks,
104   // which together cover the whole region without gaps.  Chunks participate in
105   // a doubly-linked list, and the prev/next pointers point to the physically
106   // adjacent chunks.
107   //
108   // Since a chunk cannot be partially in use, we may need to split a free chunk
109   // in order to service a user allocation.  We always merge adjacent free
110   // chunks.
111   //
112   // Chunks contain information about whether they are in use or whether they
113   // are free, and contain a pointer to the bin they are in.
114   struct Chunk {
115     size_t size = 0;  // Full size of buffer.
116 
117     // We sometimes give chunks that are larger than needed to reduce
118     // fragmentation.  requested_size keeps track of what the client
119     // actually wanted so we can understand whether our splitting
120     // strategy is efficient.
121     size_t requested_size = 0;
122 
123     // allocation_id is set to -1 when the chunk is not in use. It is assigned a
124     // value greater than zero before the chunk is returned from
125     // AllocateRaw, and this value is unique among values assigned by
126     // the parent allocator.
127     int64 allocation_id = -1;
128     void* ptr = nullptr;  // pointer to granted subbuffer.
129 
130     // If not kInvalidChunkHandle, the memory referred to by 'prev' is directly
131     // preceding the memory used by this chunk.  E.g., It should start
132     // at 'ptr - prev->size'
133     ChunkHandle prev = kInvalidChunkHandle;
134 
135     // If not kInvalidChunkHandle, the memory referred to by 'next' is directly
136     // following the memory used by this chunk.  E.g., It should be at
137     // 'ptr + size'
138     ChunkHandle next = kInvalidChunkHandle;
139 
140     // What bin are we in?
141     BinNum bin_num = kInvalidBinNum;
142 
143     // Optional count when this chunk was most recently made free.
144     uint64 freed_count = 0;
145 
in_useChunk146     bool in_use() const { return allocation_id != -1; }
147 
DebugStringChunk148     string DebugString(BFCAllocator* a,
149                        bool recurse) NO_THREAD_SAFETY_ANALYSIS {
150       string dbg;
151       strings::StrAppend(
152           &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
153           " | Requested Size: ", strings::HumanReadableNumBytes(requested_size),
154           " | in_use: ", in_use());
155       if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
156         Chunk* p = a->ChunkFromHandle(prev);
157         strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
158       }
159       if (recurse && next != BFCAllocator::kInvalidChunkHandle) {
160         Chunk* n = a->ChunkFromHandle(next);
161         strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
162       }
163       return dbg;
164     }
165   };
166 
167   // A Bin is a collection of similar-sized free chunks.
168   struct Bin {
169     // All chunks in this bin have >= bin_size memory.
170     size_t bin_size = 0;
171 
172     struct ChunkComparator {
ChunkComparatorBin::ChunkComparator173       explicit ChunkComparator(BFCAllocator* allocator)
174           : allocator_(allocator) {}
175       // Sort first by size and then use pointer address as a tie breaker.
operatorBin::ChunkComparator176       bool operator()(const ChunkHandle ha,
177                       const ChunkHandle hb) const NO_THREAD_SAFETY_ANALYSIS {
178         const Chunk* a = allocator_->ChunkFromHandle(ha);
179         const Chunk* b = allocator_->ChunkFromHandle(hb);
180         if (a->size != b->size) {
181           return a->size < b->size;
182         }
183         return a->ptr < b->ptr;
184       }
185 
186      private:
187       BFCAllocator* allocator_;  // The parent allocator
188     };
189 
190     typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
191     // List of free chunks within the bin, sorted by chunk size.
192     // Chunk * not owned.
193     FreeChunkSet free_chunks;
BinBin194     Bin(BFCAllocator* allocator, size_t bs)
195         : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
196   };
197 
198   static const size_t kMinAllocationBits = 8;
199   static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
200 
201   // BFCAllocator allocates memory into a collection of disjoint
202   // AllocationRegions.  Each AllocationRegion corresponds to one call to
203   // SubAllocator::Alloc().
204   //
205   // An AllocationRegion contains one or more Chunks, covering all of its
206   // memory.  Its primary job is to map a pointers to ChunkHandles.
207   //
208   // This class is thread-compatible.
209   class AllocationRegion {
210    public:
AllocationRegion(void * ptr,size_t memory_size)211     AllocationRegion(void* ptr, size_t memory_size)
212         : ptr_(ptr),
213           memory_size_(memory_size),
214           end_ptr_(
215               static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
216       DCHECK_EQ(0, memory_size % kMinAllocationSize);
217       const size_t n_handles =
218           (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
219       handles_.reset(new ChunkHandle[n_handles]);
220       for (size_t i = 0; i < n_handles; i++) {
221         handles_[i] = kInvalidChunkHandle;
222       }
223     }
224 
225     AllocationRegion() = default;
AllocationRegion(AllocationRegion && other)226     AllocationRegion(AllocationRegion&& other) { Swap(other); }
227     AllocationRegion& operator=(AllocationRegion&& other) {
228       Swap(other);
229       return *this;
230     }
231 
ptr()232     void* ptr() const { return ptr_; }
end_ptr()233     void* end_ptr() const { return end_ptr_; }
memory_size()234     size_t memory_size() const { return memory_size_; }
get_handle(const void * p)235     ChunkHandle get_handle(const void* p) const {
236       return handles_[IndexFor(p)];
237     }
set_handle(const void * p,ChunkHandle h)238     void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
erase(const void * p)239     void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }
240 
241    private:
Swap(AllocationRegion & other)242     void Swap(AllocationRegion& other) {
243       std::swap(ptr_, other.ptr_);
244       std::swap(memory_size_, other.memory_size_);
245       std::swap(end_ptr_, other.end_ptr_);
246       std::swap(handles_, other.handles_);
247     }
248 
IndexFor(const void * p)249     int IndexFor(const void* p) const {
250       std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
251       std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
252       DCHECK_GE(p_int, base_int);
253       DCHECK_LT(p_int, base_int + memory_size_);
254       return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
255     }
256 
257     // Metadata about the allocation region.
258     void* ptr_ = nullptr;
259     size_t memory_size_ = 0;
260     void* end_ptr_ = nullptr;
261 
262     // Array of size "memory_size / kMinAllocationSize".  It is
263     // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
264     // for the memory allocation represented by "p"
265     std::unique_ptr<ChunkHandle[]> handles_;
266 
267     TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
268   };
269 
270   // RegionManager aggregates one or more "AllocationRegions" and provides
271   // a layer of indirection from pointers to the underlying ChunkHandle,
272   // allowing allocation across multiple discontiguous memory regions.
273   //
274   // This class is thread-compatible.
275   class RegionManager {
276    public:
RegionManager()277     RegionManager() {}
~RegionManager()278     ~RegionManager() {}
279 
AddAllocationRegion(void * ptr,size_t memory_size)280     void AddAllocationRegion(void* ptr, size_t memory_size) {
281       // Insert sorted by end_ptr
282       auto entry =
283           std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
284       regions_.insert(entry, AllocationRegion(ptr, memory_size));
285     }
286 
get_handle(const void * p)287     ChunkHandle get_handle(const void* p) const {
288       return RegionFor(p)->get_handle(p);
289     }
290 
set_handle(const void * p,ChunkHandle h)291     void set_handle(const void* p, ChunkHandle h) {
292       return MutableRegionFor(p)->set_handle(p, h);
293     }
erase(const void * p)294     void erase(const void* p) { return MutableRegionFor(p)->erase(p); }
295 
regions()296     const std::vector<AllocationRegion>& regions() const { return regions_; }
297 
298    private:
Comparator(const void * ptr,const AllocationRegion & other)299     static bool Comparator(const void* ptr, const AllocationRegion& other) {
300       return ptr < other.end_ptr();
301     }
302 
MutableRegionFor(const void * p)303     AllocationRegion* MutableRegionFor(const void* p) {
304       return const_cast<AllocationRegion*>(RegionFor(p));
305     }
306 
RegionFor(const void * p)307     const AllocationRegion* RegionFor(const void* p) const {
308       auto entry =
309           std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);
310 
311       if (entry != regions_.end()) {
312         return &(*entry);
313       }
314 
315       LOG(FATAL) << "Could not find Region for " << p;
316       return nullptr;
317     }
318 
319    private:
320     std::vector<AllocationRegion> regions_;
321   };
322 
323   // Returns 'bytes' rounded up to the next highest kMinAllocationSize.
324   static size_t RoundedBytes(size_t bytes);
325 
326   // Try to add a new memory region that can satisfy an allocation of
327   // 'rounded_bytes' bytes.  Returns true on success and false on
328   // failure.
329   bool Extend(size_t alignment, size_t rounded_bytes)
330       EXCLUSIVE_LOCKS_REQUIRED(lock_);
331 
332   // Returns a pointer to an underlying allocated chunk of size
333   // 'rounded_bytes'.
334   void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
335                      uint64 freed_before) EXCLUSIVE_LOCKS_REQUIRED(lock_);
336 
337   // Splits the chunk specified by 'h' into two chunks, one at least
338   // of size 'num_bytes'.
339   void SplitChunk(ChunkHandle h, size_t num_bytes)
340       EXCLUSIVE_LOCKS_REQUIRED(lock_);
341 
342   // Merges the two chunk handles.  Requires that the chunks are
343   // contiguous in their allocation.
344   void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
345 
346   // Frees the memory represented by 'h', coalescing the chunk if
347   // possible.
348   void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
349 
350   // Adds the chunk 'h' to the proper free bin.
351   void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
352 
353   // Removes the free chunk pointed to by 'c' from the set free_chunks.
354   void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
355                                   const Bin::FreeChunkSet::iterator& c)
356       EXCLUSIVE_LOCKS_REQUIRED(lock_);
357 
358   // Removes a free chunk from the bin.
359   void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
360 
361   // Removes the chunk metadata represented by 'h'.
362   void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
363 
364   string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_);
365   void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
366 
367   ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_);
368   void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
369 
370   Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
371 
372   // Information about a Bin that is useful for debugging.
373   struct BinDebugInfo {
374     size_t total_bytes_in_use = 0;
375     size_t total_bytes_in_bin = 0;
376     size_t total_requested_bytes_in_use = 0;
377     size_t total_chunks_in_use = 0;
378     size_t total_chunks_in_bin = 0;
379   };
380 
381   // Computes and returns a BinDebugInfo for each Bin.
382   std::array<BinDebugInfo, kNumBins> get_bin_debug_info()
383       EXCLUSIVE_LOCKS_REQUIRED(lock_);
384 
385   AllocatorRetry retry_helper_;
386 
387   // Structures immutable after construction
388   size_t memory_limit_ = 0;
389 
Log2FloorNonZeroSlow(uint64 n)390   inline int Log2FloorNonZeroSlow(uint64 n) {
391     int r = 0;
392     while (n > 0) {
393       r++;
394       n >>= 1;
395     }
396     return r - 1;
397   }
398 
399   // Returns floor(log2(n)).
Log2FloorNonZero(uint64 n)400   inline int Log2FloorNonZero(uint64 n) {
401 #if defined(__GNUC__)
402     return 63 ^ __builtin_clzll(n);
403 #elif defined(PLATFORM_WINDOWS) && (_WIN64)
404     unsigned long index;
405     _BitScanReverse64(&index, n);
406     return index;
407 #else
408     return Log2FloorNonZeroSlow(n);
409 #endif
410   }
411 
412   // Map from bin size to Bin
BinFromIndex(BinNum index)413   Bin* BinFromIndex(BinNum index) {
414     return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
415   }
BinNumToSize(BinNum index)416   size_t BinNumToSize(BinNum index) {
417     return static_cast<size_t>(256) << index;
418   }
BinNumForSize(size_t bytes)419   BinNum BinNumForSize(size_t bytes) {
420     uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
421     int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
422     return b;
423   }
BinForSize(size_t bytes)424   Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }
425 
426   char bins_space_[sizeof(Bin) * kNumBins];
427 
428   // The size of the current region allocation.
429   size_t curr_region_allocation_bytes_;
430 
431   // The total number of allocated bytes by the allocator.
432   size_t total_region_allocated_bytes_ = 0;
433 
434   // An indicator that expansion of a region has hit the limits
435   // of the available memory.
436   bool started_backpedal_ = false;
437 
438   std::unique_ptr<SubAllocator> sub_allocator_;
439   string name_;
440   SharedCounter* timing_counter_ = nullptr;
441 
442   // Structures mutable after construction
443   mutable mutex lock_;
444   RegionManager region_manager_ GUARDED_BY(lock_);
445 
446   std::vector<Chunk> chunks_ GUARDED_BY(lock_);
447 
448   // Pointer to head of linked list of free Chunks
449   ChunkHandle free_chunks_list_ GUARDED_BY(lock_);
450 
451   // Counter containing the next unique identifier to assign to a
452   // newly-created chunk.
453   int64 next_allocation_id_ GUARDED_BY(lock_);
454 
455   // Stats.
456   AllocatorStats stats_ GUARDED_BY(lock_);
457 
458   friend class GPUBFCAllocatorPrivateMethodsTest;
459   TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
460 };
461 
462 }  // namespace tensorflow
463 
464 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
465