1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/common_runtime/bfc_allocator.h"
17 
18 #include <atomic>
19 
20 #include "absl/strings/string_view.h"
21 #include "tensorflow/core/common_runtime/allocator_retry.h"
22 #include "tensorflow/core/lib/core/bits.h"
23 #include "tensorflow/core/lib/strings/numbers.h"
24 #include "tensorflow/core/lib/strings/str_util.h"
25 #include "tensorflow/core/lib/strings/strcat.h"
26 #include "tensorflow/core/platform/file_system.h"
27 #include "tensorflow/core/platform/logging.h"
28 #include "tensorflow/core/platform/mutex.h"
29 #include "tensorflow/core/platform/stacktrace.h"
30 #include "tensorflow/core/framework/tensor_shape.h"
31 #include "tensorflow/core/platform/types.h"
32 #include "tensorflow/core/profiler/lib/traceme.h"
33 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
34 
35 namespace tensorflow {
36 
37 constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle;
38 constexpr uint64 BFCAllocator::kMemDebugHistorySize;
39 
BFCAllocator(SubAllocator * sub_allocator,size_t total_memory,bool allow_growth,const string & name,bool garbage_collection)40 BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
41                            bool allow_growth, const string& name,
42                            bool garbage_collection)
43     : garbage_collection_(garbage_collection),
44       coalesce_regions_(sub_allocator->SupportsCoalescing()),
45       sub_allocator_(sub_allocator),
46       name_(name),
47       free_chunks_list_(kInvalidChunkHandle),
48       next_allocation_id_(1) {
49   if (allow_growth) {
50     // 2MiB smallest initial allocation, unless total memory available
51     // is less.
52     curr_region_allocation_bytes_ =
53         RoundedBytes(std::min(total_memory, size_t{2 << 20}));
54   } else {
55     curr_region_allocation_bytes_ = RoundedBytes(total_memory);
56   }
57 
58   // Allocate the requested amount of memory.
59   memory_limit_ = total_memory;
60   stats_.bytes_limit = static_cast<int64>(total_memory);
61 
62   // Create a bunch of bins of various good sizes.
63 
64   // We create bins to fit all possible ranges that cover the
65   // memory_limit_ starting from allocations up to 256 bytes to
66   // allocations up to (and including) the memory limit.
67   for (BinNum b = 0; b < kNumBins; b++) {
68     size_t bin_size = BinNumToSize(b);
69     VLOG(1) << "Creating bin of max chunk size "
70             << strings::HumanReadableNumBytes(bin_size);
71     new (BinFromIndex(b)) Bin(this, bin_size);
72     CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
73     CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
74     CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
75     if (b + 1 < kNumBins) {
76       CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
77     }
78   }
79 }
80 
~BFCAllocator()81 BFCAllocator::~BFCAllocator() {
82   // Return memory back.
83   VLOG(2) << "Number of regions allocated: "
84           << region_manager_.regions().size();
85   for (const auto& region : region_manager_.regions()) {
86     sub_allocator_->Free(region.ptr(), region.memory_size());
87   }
88 
89   for (BinNum b = 0; b < kNumBins; b++) {
90     BinFromIndex(b)->~Bin();
91   }
92 }
93 
ChunkFromHandle(ChunkHandle h)94 BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
95   DCHECK_GE(h, 0);
96   DCHECK_LT(h, static_cast<int>(chunks_.size()));
97   return &(chunks_[h]);
98 }
99 
ChunkFromHandle(ChunkHandle h) const100 const BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) const {
101   DCHECK_GE(h, 0);
102   DCHECK_LT(h, static_cast<int>(chunks_.size()));
103   return &(chunks_[h]);
104 }
105 
Extend(size_t alignment,size_t rounded_bytes)106 bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
107   size_t available_bytes = memory_limit_ - total_region_allocated_bytes_;
108   // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
109   available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
110 
111   // Do we have enough space to handle the client's request?
112   // If not, fail immediately.
113   if (rounded_bytes > available_bytes) {
114     return false;
115   }
116 
117   // If curr_region_allocation_bytes_ is not enough to satisfy the
118   // allocation, keep multiplying by a power of two until that is
119   // sufficient.
120   bool increased_allocation = false;
121   while (rounded_bytes > curr_region_allocation_bytes_) {
122     curr_region_allocation_bytes_ *= 2;
123     increased_allocation = true;
124   }
125 
126   // Try allocating.
127   size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
128   size_t bytes_received;
129   void* mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
130   if (mem_addr == nullptr && !started_backpedal_) {
131     // Only backpedal once.
132     started_backpedal_ = true;
133 
134     static constexpr float kBackpedalFactor = 0.9;
135 
136     // Try allocating less memory.
137     while (mem_addr == nullptr) {
138       bytes = RoundedBytes(bytes * kBackpedalFactor);
139       if (bytes < rounded_bytes) break;
140       mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
141     }
142   }
143 
144   if (mem_addr == nullptr) {
145     return false;
146   }
147 
148   if (!increased_allocation) {
149     // Increase the region size of the next required allocation.
150     curr_region_allocation_bytes_ *= 2;
151   }
152 
153   VLOG(1) << "Extending allocation by "
154           << strings::HumanReadableNumBytes(bytes_received) << " bytes.";
155 
156   total_region_allocated_bytes_ += bytes_received;
157   VLOG(1) << "Total allocated bytes: "
158           << strings::HumanReadableNumBytes(total_region_allocated_bytes_);
159 
160   VLOG(1) << "Allocated memory at " << mem_addr << " to "
161           << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);
162 
163   AllocationRegion* maybe_extended_region = nullptr;
164   if (coalesce_regions_) {
165     maybe_extended_region =
166         region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
167   } else {
168     region_manager_.AddAllocationRegion(mem_addr, bytes_received);
169   }
170 
171   // Create one large chunk for the whole memory space that will
172   // be chunked later.
173   ChunkHandle h = AllocateChunk();
174   BFCAllocator::Chunk* c = ChunkFromHandle(h);
175   c->ptr = mem_addr;
176   c->size = bytes_received;
177   c->allocation_id = -1;
178   c->prev = kInvalidChunkHandle;
179   c->next = kInvalidChunkHandle;
180   c->freed_at_count = 0;
181 
182   region_manager_.set_handle(c->ptr, h);
183 
184   // If the region was extended, then there exists a previous chunk that should
185   // be linked to the new chunk.
186   if (maybe_extended_region != nullptr) {
187     ChunkHandle prev =
188         maybe_extended_region->get_handle(maybe_extended_region->ptr());
189     BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
190     // Find the last recorded chunk in the extended region.
191     while (prev_chunk->next != kInvalidChunkHandle) {
192       prev = prev_chunk->next;
193       prev_chunk = ChunkFromHandle(prev);
194     }
195     c->prev = prev;
196     prev_chunk->next = h;
197   }
198 
199   // Maybe merge adjacent chunks and insert the chunk into the right bin.
200   InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));
201 
202   return true;
203 }
204 
AllocateChunk()205 BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() {
206   if (free_chunks_list_ != kInvalidChunkHandle) {
207     ChunkHandle h = free_chunks_list_;
208     Chunk* c = ChunkFromHandle(h);
209     free_chunks_list_ = c->next;
210     return h;
211   } else {
212     ChunkHandle h = chunks_.size();
213     chunks_.resize(h + 1);
214     return h;
215   }
216 }
217 
DeallocateChunk(ChunkHandle h)218 void BFCAllocator::DeallocateChunk(ChunkHandle h) {
219   Chunk* c = ChunkFromHandle(h);
220   c->allocation_id = -1;
221   c->bin_num = kInvalidBinNum;
222   c->next = free_chunks_list_;
223   free_chunks_list_ = h;
224 }
225 
AllocateRawInternalWithRetry(size_t unused_alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)226 void* BFCAllocator::AllocateRawInternalWithRetry(
227     size_t unused_alignment, size_t num_bytes,
228     const AllocationAttributes& allocation_attr) {
229   // Fast path: Try once to allocate without getting the retry_helper_ involved
230   uint64 freed_by_count = 0;
231   if (allocation_attr.freed_by_func != nullptr) {
232     freed_by_count = (*allocation_attr.freed_by_func)();
233   }
234   void* r =
235       AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
236   if (r != nullptr) {
237     return r;
238   } else {
239     static const int64 kMaxMillisToWait = 10000;  // 10 seconds
240     r = retry_helper_.AllocateRaw(
241         [this, &allocation_attr](size_t a, size_t nb, bool v) {
242           uint64 freed_by_count = 0;
243           if (allocation_attr.freed_by_func != nullptr) {
244             freed_by_count = (*allocation_attr.freed_by_func)();
245           }
246           return AllocateRawInternal(a, nb, v, freed_by_count);
247         },
248         kMaxMillisToWait, unused_alignment, num_bytes);
249     return r;
250   }
251 }
252 
AllocateRaw(size_t unused_alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)253 void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
254                                 const AllocationAttributes& allocation_attr) {
255   VLOG(1) << "AllocateRaw " << Name() << "  " << num_bytes;
256   if (!allocation_attr.retry_on_failure) {
257     // Return immediately upon the first failure if this is for allocating an
258     // optional scratch space.
259     bool dump_log_on_failure = VLOG_IS_ON(2);
260     uint64 freed_by_count = 0;
261     if (allocation_attr.freed_by_func != nullptr) {
262       freed_by_count = (*allocation_attr.freed_by_func)();
263     }
264     void* result = AllocateRawInternal(unused_alignment, num_bytes,
265                                        dump_log_on_failure, freed_by_count);
266     if (result == nullptr) {
267       static std::atomic<int32> log_counter{0};
268       int32 counter_value = log_counter.load(std::memory_order_relaxed);
269       if (counter_value < 10) {
270         log_counter.store(counter_value + 1, std::memory_order_relaxed);
271         LOG(WARNING)
272             << "Allocator (" << Name() << ") ran out of memory trying "
273             << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
274             << " with freed_by_count=" << freed_by_count
275 
276             << ". The caller indicates that this is not a failure, but"
277             << " may mean that there could be performance gains if more"
278             << " memory were available.";
279       }
280     }
281     return result;
282   } else {
283     return AllocateRawInternalWithRetry(unused_alignment, num_bytes,
284                                         allocation_attr);
285   }
286 }
287 
288 // static
RoundedBytes(size_t bytes)289 size_t BFCAllocator::RoundedBytes(size_t bytes) {
290   size_t rounded_bytes =
291       (kMinAllocationSize *
292        ((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
293   DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
294   return rounded_bytes;
295 }
296 
DeallocateFreeRegions(size_t rounded_bytes)297 bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes)
298     TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
299   // Do nothing if garbage collection is off.
300   if (!garbage_collection_) {
301     return false;
302   }
303 
304   // Searching for free regions.
305   absl::flat_hash_set<void*> free_region_ptrs;
306   size_t total_free_bytes = 0;
307   for (const AllocationRegion& region : region_manager_.regions()) {
308     ChunkHandle h = region_manager_.get_handle(region.ptr());
309     bool any_use = false;
310     while (h != kInvalidChunkHandle) {
311       const Chunk* c = ChunkFromHandle(h);
312       if (c->in_use()) {
313         any_use = true;
314         break;
315       }
316       h = c->next;
317     }
318 
319     if (!any_use) {
320       VLOG(2) << "Found free region with ptr = " << region.ptr();
321       free_region_ptrs.insert(region.ptr());
322       total_free_bytes += region.memory_size();
323     }
324   }
325 
326   if (total_free_bytes == 0) {
327     return false;
328   }
329 
330   // Rough estimation to check whether deallocation can help.
331   size_t available_bytes =
332       memory_limit_ - total_region_allocated_bytes_ + total_free_bytes;
333   if (rounded_bytes > available_bytes) {
334     return false;
335   }
336 
337   LOG(WARNING) << "Garbage collection: deallocate free memory regions"
338                << " (i.e., allocations) so that we can re-allocate a larger"
339                << " region to avoid OOM due to memory fragmentation. If you"
340                << " see this message frequently, you are running near the"
341                << " threshold of the available device memory and re-allocation"
342                << " may incur great performance overhead. You may try smaller"
343                << " batch sizes to observe the performance impact."
344                << " Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to"
345                << " disable this feature.";
346 
347   // Deallocate free regions.
348   DeallocateRegions(free_region_ptrs);
349 
350   return true;
351 }
352 
DeallocateRegions(const absl::flat_hash_set<void * > & region_ptrs)353 void BFCAllocator::DeallocateRegions(
354     const absl::flat_hash_set<void*>& region_ptrs)
355     TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
356   // Explicitly remove the const qualifier as some compilers disallow passing
357   // const_iterator to std::vector::erase(), which is used in
358   // RemoveAllocationRegion().
359   auto regions =
360       const_cast<std::vector<AllocationRegion>*>(&region_manager_.regions());
361   auto it = regions->begin();
362   while (it != regions->end()) {
363     if (!region_ptrs.contains(it->ptr())) {
364       ++it;
365       continue;
366     }
367 
368     VLOG(2) << "Deallocate region with ptr = " << it->ptr();
369     // Remove all chunk registrations from Bins.
370     ChunkHandle h = region_manager_.get_handle(it->ptr());
371     while (h != kInvalidChunkHandle) {
372       const Chunk* c = ChunkFromHandle(h);
373       if (c->bin_num != kInvalidBinNum) {
374         RemoveFreeChunkFromBin(h);
375       }
376       auto h_to_delete = h;
377       h = c->next;
378       DeleteChunk(h_to_delete);
379     }
380 
381     // Deallocate the memory.
382     sub_allocator_->Free(it->ptr(), it->memory_size());
383     total_region_allocated_bytes_ -= it->memory_size();
384     it = region_manager_.RemoveAllocationRegion(it);
385   }
386 }
387 
AllocateRawInternal(size_t unused_alignment,size_t num_bytes,bool dump_log_on_failure,uint64 freed_before)388 void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
389                                         size_t num_bytes,
390                                         bool dump_log_on_failure,
391                                         uint64 freed_before) {
392   if (num_bytes == 0) {
393     VLOG(2) << "tried to allocate 0 bytes";
394     return nullptr;
395   }
396   // First, always allocate memory of at least kMinAllocationSize
397   // bytes, and always allocate multiples of kMinAllocationSize bytes
398   // so all memory addresses are nicely byte aligned.
399   size_t rounded_bytes = RoundedBytes(num_bytes);
400 
401   // The BFC allocator tries to find the best fit first.
402   BinNum bin_num = BinNumForSize(rounded_bytes);
403 
404   mutex_lock l(lock_);
405   if (!timestamped_chunks_.empty()) {
406     // Merge timestamped chunks whose counts have become safe for general use.
407     MergeTimestampedChunks(0);
408   }
409   void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
410   if (ptr != nullptr) {
411     AddTraceMe("MemoryAllocation", ptr);
412     return ptr;
413   }
414 
415   // Try to extend
416   if (Extend(unused_alignment, rounded_bytes)) {
417     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
418     if (ptr != nullptr) {
419       AddTraceMe("MemoryAllocation", ptr);
420       return ptr;
421     }
422   }
423 
424   if ((freed_before == 0) && (!timestamped_chunks_.empty())) {
425     // We're unable to satisfy an allocation request without a specific
426     // timestamp requirement.  Rather than fail, try merging any held-out
427     // timestamped chunks more aggressively until a free chunk of the necessary
428     // size is formed.
429     if (MergeTimestampedChunks(rounded_bytes)) {
430       ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
431       if (ptr != nullptr) {
432         AddTraceMe("MemoryAllocation", ptr);
433         return ptr;
434       }
435     }
436   }
437 
438   // Reaching this point means that no chunks can satisfy the request. Also,
439   // the unallocated bytes cannot satisfy the request. Before giving up, let's
440   // try deallocating free regions so that suballocator can combine them with
441   // the unallocated bytes and form a larger region.
442   if (DeallocateFreeRegions(rounded_bytes) &&
443       Extend(unused_alignment, rounded_bytes)) {
444     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
445     if (ptr != nullptr) {
446       AddTraceMe("MemoryAllocation", ptr);
447       return ptr;
448     }
449   }
450 
451   // We searched all bins for an existing free chunk to use and
452   // couldn't find one.  This means we must have run out of memory,
453   // Dump the memory log for analysis.
454   MaybeWriteMemoryMap();
455   if (dump_log_on_failure) {
456     LOG(WARNING)
457         << "Allocator (" << Name() << ") ran out of memory trying "
458         << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
459         << " (rounded to " << rounded_bytes << ")"
460         << "requested by op "
461         << ScopedMemoryDebugAnnotation::CurrentAnnotation().pending_op_name
462         << "\nCurrent allocation summary follows.";
463     DumpMemoryLog(rounded_bytes);
464     LOG(WARNING) << RenderOccupancy();
465   }
466   return nullptr;
467 }
468 
LargestFreeChunk()469 int64 BFCAllocator::LargestFreeChunk() {
470   for (int i = kNumBins - 1; i >= 0; i--) {
471     if (!BinFromIndex(i)->free_chunks.empty()) {
472       return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size;
473     }
474   }
475   return 0;
476 }
477 
GetFragmentation()478 double BFCAllocator::GetFragmentation() {
479   int64 bytes_available = total_region_allocated_bytes_ - stats_.bytes_in_use;
480   DCHECK_GT(bytes_available, 0);
481   return static_cast<double>(bytes_available - LargestFreeChunk()) /
482          bytes_available;
483 }
484 
AddTraceMe(absl::string_view traceme_name,const void * ptr)485 void BFCAllocator::AddTraceMe(absl::string_view traceme_name, const void* ptr) {
486   BFCAllocator::Chunk* chunk = ChunkFromHandle(region_manager_.get_handle(ptr));
487   AddTraceMe(traceme_name, chunk->ptr, chunk->requested_size, chunk->size);
488 }
489 
AddTraceMe(absl::string_view traceme_name,const void * chunk_ptr,int64 req_bytes,int64 alloc_bytes)490 void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
491                               const void* chunk_ptr, int64 req_bytes,
492                               int64 alloc_bytes) {
493   tensorflow::profiler::TraceMe::InstantActivity(
494       [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
495           TF_NO_THREAD_SAFETY_ANALYSIS {
496             int64 bytes_available =
497                 memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
498             const auto& annotation =
499                 ScopedMemoryDebugAnnotation::CurrentAnnotation();
500             std::string tensor_shape;
501             if (annotation.pending_shape) {
502               tensor_shape = annotation.pending_shape->DebugString();
503             }
504             return tensorflow::profiler::TraceMeEncode(
505                 traceme_name, {{"allocator_name", name_},
506                                {"bytes_reserved", stats_.bytes_reserved},
507                                {"bytes_allocated", stats_.bytes_in_use},
508                                {"bytes_available", bytes_available},
509                                {"fragmentation", GetFragmentation()},
510                                {"peak_bytes_in_use", stats_.peak_bytes_in_use},
511                                {"requested_bytes", req_bytes},
512                                {"allocation_bytes", alloc_bytes},
513                                {"addr", reinterpret_cast<uint64>(chunk_ptr)},
514                                {"tf_op", annotation.pending_op_name},
515                                {"id", annotation.pending_step_id},
516                                {"region_type", annotation.pending_region_type},
517                                {"data_type", annotation.pending_data_type},
518                                {"shape", tensor_shape}});
519           },
520       /*level=*/profiler::TraceMeLevel::kInfo);
521 }
522 
FindChunkPtr(BinNum bin_num,size_t rounded_bytes,size_t num_bytes,uint64 freed_before)523 void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
524                                  size_t num_bytes, uint64 freed_before) {
525   // First identify the first bin that could satisfy rounded_bytes.
526   for (; bin_num < kNumBins; bin_num++) {
527     // Start searching from the first bin for the smallest chunk that fits
528     // rounded_bytes.
529     Bin* b = BinFromIndex(bin_num);
530     for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
531          ++citer) {
532       const BFCAllocator::ChunkHandle h = (*citer);
533       BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
534       DCHECK(!chunk->in_use());
535       if (freed_before > 0 && freed_before < chunk->freed_at_count) {
536         continue;
537       }
538       if (chunk->size >= rounded_bytes) {
539         // We found an existing chunk that fits us that wasn't in use, so remove
540         // it from the free bin structure prior to using.
541         RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
542 
543         // If we can break the size of the chunk into two reasonably large
544         // pieces, do so.  In any case don't waste more than
545         // kMaxInternalFragmentation bytes on padding this alloc.
546         const int64 kMaxInternalFragmentation = 128 << 20;  // 128mb
547         if (chunk->size >= rounded_bytes * 2 ||
548             static_cast<int64>(chunk->size) - rounded_bytes >=
549                 kMaxInternalFragmentation) {
550           SplitChunk(h, rounded_bytes);
551           chunk = ChunkFromHandle(h);  // Update chunk pointer in case it moved
552         }
553 
554         // The requested size of the returned chunk is what the user
555         // has allocated.
556         chunk->requested_size = num_bytes;
557         // Assign a unique id and increment the id counter, marking the
558         // chunk as being in use.
559         chunk->allocation_id = next_allocation_id_++;
560 
561         // Update stats.
562         ++stats_.num_allocs;
563         stats_.bytes_in_use += chunk->size;
564         stats_.peak_bytes_in_use =
565             std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
566         stats_.largest_alloc_size =
567             std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
568         if (ShouldRecordOpName()) {
569           const auto& annotation =
570               ScopedMemoryDebugAnnotation::CurrentAnnotation();
571           chunk->op_name = annotation.pending_op_name;
572           if (!annotation.pending_op_name) {
573             VLOG(2) << "missing pending_op_name for " << Name()
574                     << " reading addr "
575                     << static_cast<const void*>(&annotation.pending_op_name)
576                     << "\n"
577                     << CurrentStackTrace();
578           }
579           chunk->step_id = annotation.pending_step_id;
580           chunk->action_count = ++action_counter_;
581           uint64 slot = chunk->action_count % kMemDebugHistorySize;
582           size_history_[slot] = stats_.bytes_in_use;
583         }
584 
585         VLOG(4) << "Returning: " << chunk->ptr;
586         if (VLOG_IS_ON(4)) {
587           LOG(INFO) << "A: " << RenderOccupancy();
588         }
589         return chunk->ptr;
590       }
591     }
592   }
593 
594   return nullptr;
595 }
596 
SplitChunk(BFCAllocator::ChunkHandle h,size_t num_bytes)597 void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
598   // Allocate the new chunk before we do any ChunkFromHandle
599   ChunkHandle h_new_chunk = AllocateChunk();
600 
601   Chunk* c = ChunkFromHandle(h);
602   CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
603 
604   // Create a new chunk starting num_bytes after c
605   BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
606   new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
607   region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
608 
609   // Set the new sizes of the chunks.
610   new_chunk->size = c->size - num_bytes;
611   c->size = num_bytes;
612 
613   // The new chunk is not in use.
614   new_chunk->allocation_id = -1;
615 
616   // It inherits the freed time.
617   new_chunk->freed_at_count = c->freed_at_count;
618 
619   // Maintain the pointers.
620   // c <-> c_neighbor becomes
621   // c <-> new_chunk <-> c_neighbor
622   BFCAllocator::ChunkHandle h_neighbor = c->next;
623   new_chunk->prev = h;
624   new_chunk->next = h_neighbor;
625   c->next = h_new_chunk;
626   if (h_neighbor != kInvalidChunkHandle) {
627     Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
628     c_neighbor->prev = h_new_chunk;
629   }
630 
631   // Add the newly free chunk to the free bin.
632   InsertFreeChunkIntoBin(h_new_chunk);
633 }
634 
DeallocateRaw(void * ptr)635 void BFCAllocator::DeallocateRaw(void* ptr) {
636   VLOG(1) << "DeallocateRaw " << Name() << " "
637           << (ptr ? RequestedSize(ptr) : 0);
638   DeallocateRawInternal(ptr);
639   retry_helper_.NotifyDealloc();
640 }
641 
DeallocateRawInternal(void * ptr)642 void BFCAllocator::DeallocateRawInternal(void* ptr) {
643   if (ptr == nullptr) {
644     VLOG(2) << "tried to deallocate nullptr";
645     return;
646   }
647   mutex_lock l(lock_);
648 
649   // Find the chunk from the ptr.
650   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
651   CHECK(h != kInvalidChunkHandle);
652   // Record chunk information before it's freed.
653   Chunk* chunk = ChunkFromHandle(h);
654   void* chunk_ptr = chunk->ptr;
655   int64 req_bytes = chunk->requested_size;
656   int64 alloc_bytes = chunk->size;
657 
658   MarkFree(h);
659 
660   // Consider coalescing it.
661   if (timing_counter_) {
662     InsertFreeChunkIntoBin(h);
663     timestamped_chunks_.push_back(h);
664   } else {
665     InsertFreeChunkIntoBin(TryToCoalesce(h, false));
666   }
667 
668   // TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for
669   // correct aggregation stats (bytes_in_use, fragmentation).
670   AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes);
671 
672   if (VLOG_IS_ON(4)) {
673     LOG(INFO) << "F: " << RenderOccupancy();
674   }
675 }
676 
677 // Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is c1.
678 // We merge Chunk(h2) into Chunk(h1).
Merge(BFCAllocator::ChunkHandle h1,BFCAllocator::ChunkHandle h2)679 void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
680                          BFCAllocator::ChunkHandle h2) {
681   Chunk* c1 = ChunkFromHandle(h1);
682   Chunk* c2 = ChunkFromHandle(h2);
683   // We can only merge chunks that are not in use.
684   CHECK(!c1->in_use() && !c2->in_use());
685 
686   // c1's prev doesn't change, still points to the same ptr, and is
687   // still not in use.
688 
689   // Fix up neighbor pointers
690   //
691   // c1 <-> c2 <-> c3 should become
692   // c1 <-> c3
693 
694   BFCAllocator::ChunkHandle h3 = c2->next;
695   c1->next = h3;
696   CHECK(c2->prev == h1);
697   if (h3 != kInvalidChunkHandle) {
698     BFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
699     c3->prev = h1;
700   }
701 
702   // Set the new size
703   c1->size += c2->size;
704 
705   // Pick latest free time.
706   c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count);
707 
708   DeleteChunk(h2);
709 }
710 
DeleteChunk(ChunkHandle h)711 void BFCAllocator::DeleteChunk(ChunkHandle h) {
712   // Delete h and cleanup all state
713   Chunk* c = ChunkFromHandle(h);
714   //  VLOG(4) << "Removing: " << c->ptr;
715   region_manager_.erase(c->ptr);
716   DeallocateChunk(h);
717 }
718 
InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h)719 void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) {
720   Chunk* c = ChunkFromHandle(h);
721   CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
722   BinNum bin_num = BinNumForSize(c->size);
723   Bin* new_bin = BinFromIndex(bin_num);
724   c->bin_num = bin_num;
725   new_bin->free_chunks.insert(h);
726 }
727 
RemoveFreeChunkIterFromBin(BFCAllocator::Bin::FreeChunkSet * free_chunks,const BFCAllocator::Bin::FreeChunkSet::iterator & citer)728 void BFCAllocator::RemoveFreeChunkIterFromBin(
729     BFCAllocator::Bin::FreeChunkSet* free_chunks,
730     const BFCAllocator::Bin::FreeChunkSet::iterator& citer) {
731   ChunkHandle h = *citer;
732   Chunk* c = ChunkFromHandle(h);
733   CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
734   free_chunks->erase(citer);
735   c->bin_num = kInvalidBinNum;
736 }
737 
RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h)738 void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) {
739   Chunk* c = ChunkFromHandle(h);
740   CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
741   CHECK_GT(BinFromIndex(c->bin_num)->free_chunks.erase(h), 0)
742       << "Could not find chunk in bin";
743   c->bin_num = kInvalidBinNum;
744 }
745 
MarkFree(BFCAllocator::ChunkHandle h)746 void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
747   Chunk* c = ChunkFromHandle(h);
748   CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
749 
750   // Mark the chunk as no longer in use.
751   c->allocation_id = -1;
752 
753   // Optionally record the free time.
754   if (timing_counter_) {
755     c->freed_at_count = timing_counter_->next();
756   }
757 
758   // Updates the stats.
759   stats_.bytes_in_use -= c->size;
760 
761   if (ShouldRecordOpName()) {
762     c->action_count = ++action_counter_;
763     uint64 slot = c->action_count % kMemDebugHistorySize;
764     size_history_[slot] = stats_.bytes_in_use;
765   }
766 }
767 
TryToCoalesce(ChunkHandle h,bool ignore_freed_at)768 BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
769                                                       bool ignore_freed_at) {
770   Chunk* c = ChunkFromHandle(h);
771   if ((!ignore_freed_at) && c->freed_at_count > 0) return h;
772   ChunkHandle coalesced_chunk = h;
773 
774   // If the next chunk is free, merge it into c and delete it.
775   if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) {
776     Chunk* n = ChunkFromHandle(c->next);
777     if ((n->freed_at_count == 0) || ignore_freed_at) {
778       VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr;
779       RemoveFreeChunkFromBin(c->next);
780       Merge(h, c->next);
781     }
782   }
783 
784   // If the previous chunk is free, merge c into it and delete c.
785   if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) {
786     Chunk* n = ChunkFromHandle(c->prev);
787     if ((n->freed_at_count == 0) || ignore_freed_at) {
788       VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr;
789       coalesced_chunk = c->prev;
790       RemoveFreeChunkFromBin(c->prev);
791       Merge(c->prev, h);
792     }
793   }
794 
795   return coalesced_chunk;
796 }
797 
SetSafeFrontier(uint64 count)798 void BFCAllocator::SetSafeFrontier(uint64 count) {
799   uint64 current = safe_frontier_.load(std::memory_order_relaxed);
800   while (count > current) {
801     if (safe_frontier_.compare_exchange_strong(current, count)) {
802       retry_helper_.NotifyDealloc();
803       return;
804     } else {
805       current = safe_frontier_.load(std::memory_order_relaxed);
806     }
807   }
808 }
809 
MergeTimestampedChunks(size_t required_bytes)810 bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) {
811   VLOG(1) << "MergeTimestampedChunks queue_len=" << timestamped_chunks_.size()
812           << " required_bytes=" << required_bytes;
813   bool satisfied = (required_bytes == 0);
814   std::vector<void*> to_merge;
815   std::deque<ChunkHandle> new_ts_queue;
816   while (!timestamped_chunks_.empty()) {
817     ChunkHandle h = timestamped_chunks_.front();
818     timestamped_chunks_.pop_front();
819     DCHECK_NE(h, kInvalidChunkHandle);
820     Chunk* c = ChunkFromHandle(h);
821     // It's possible this chunk has already been merged so refetch and retest
822     // the handle.
823     h = region_manager_.get_handle(c->ptr);
824     if (h == kInvalidChunkHandle) {
825       continue;
826     }
827     if (c->in_use() || (c->bin_num == kInvalidBinNum)) {
828       // This chunk has already been reallocated.
829       continue;
830     }
831     if (c->freed_at_count == 0) {
832       to_merge.push_back(c->ptr);
833       continue;
834     }
835     // Chunk should be free and assigned to a bin.
836     DCHECK_NE(c->bin_num, kInvalidBinNum);
837     if (c->freed_at_count < safe_frontier_) {
838       c->freed_at_count = 0;
839       to_merge.push_back(c->ptr);
840     } else if (required_bytes > 0) {
841       to_merge.push_back(c->ptr);
842     } else {
843       new_ts_queue.push_back(h);
844     }
845   }
846   DCHECK(timestamped_chunks_.empty());
847   std::swap(timestamped_chunks_, new_ts_queue);
848 
849   // At this point all candidate chunks have been moved from timestamped_chunks_
850   // to to_merge.  If this is a standard merge (required_bytes == 0) then
851   // merge them all, otherwise merge just until a Chunk of the required size
852   // is produced.
853   for (int ci = 0, end = to_merge.size(); ci < end; ++ci) {
854     void* ptr = to_merge[ci];
855     // It's possible that the Chunk associated with this memory location got
856     // merged and deallocated in a prior iteration so refetch the handle and
857     // retest.
858     ChunkHandle h = region_manager_.get_handle(ptr);
859     if (h == kInvalidChunkHandle) continue;
860     if (required_bytes == 0 || !satisfied) {
861       Chunk* c = ChunkFromHandle(h);
862       DCHECK_NE(c->bin_num, kInvalidBinNum);
863       DCHECK(!c->in_use());
864       RemoveFreeChunkFromBin(h);
865       ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0));
866       InsertFreeChunkIntoBin(new_h);
867       if (required_bytes > 0) {
868         c = ChunkFromHandle(new_h);
869         if (new_h != h && c->freed_at_count > 0) {
870           timestamped_chunks_.push_back(new_h);
871         }
872         if (c->size >= required_bytes) {
873           satisfied = true;
874         }
875       }
876     } else {
877       // We were force merging Chunks with unsafe timestamps, but managed
878       // to create a satisfying Chunk so just requeue the rest.
879       timestamped_chunks_.push_back(h);
880     }
881   }
882   return satisfied;
883 }
884 
TracksAllocationSizes() const885 bool BFCAllocator::TracksAllocationSizes() const { return true; }
886 
RequestedSize(const void * ptr) const887 size_t BFCAllocator::RequestedSize(const void* ptr) const {
888   CHECK(ptr);
889   mutex_lock l(lock_);
890   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
891   CHECK(h != kInvalidChunkHandle)
892       << "Asked for requested size of pointer we never allocated: " << ptr;
893   const BFCAllocator::Chunk* c = ChunkFromHandle(h);
894   return c->requested_size;
895 }
896 
AllocatedSize(const void * ptr) const897 size_t BFCAllocator::AllocatedSize(const void* ptr) const {
898   mutex_lock l(lock_);
899   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
900   CHECK(h != kInvalidChunkHandle)
901       << "Asked for allocated size of pointer we never allocated: " << ptr;
902   const BFCAllocator::Chunk* c = ChunkFromHandle(h);
903   return c->size;
904 }
905 
AllocationId(const void * ptr) const906 int64 BFCAllocator::AllocationId(const void* ptr) const {
907   mutex_lock l(lock_);
908   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
909   CHECK(h != kInvalidChunkHandle)
910       << "Asked for allocation id of pointer we never allocated: " << ptr;
911   const BFCAllocator::Chunk* c = ChunkFromHandle(h);
912   return c->allocation_id;
913 }
914 
915 namespace {
916 
RenderRegion(char * rendered,const size_t resolution,const size_t total_render_size,const size_t offset,const void * base_ptr,const void * ptr,const size_t size,const char c)917 void RenderRegion(char* rendered, const size_t resolution,
918                   const size_t total_render_size, const size_t offset,
919                   const void* base_ptr, const void* ptr, const size_t size,
920                   const char c) {
921   const char* base_ptr_c = static_cast<const char*>(base_ptr);
922   const char* ptr_c = static_cast<const char*>(ptr);
923 
924   size_t start_location =
925       ((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
926   CHECK_GE(start_location, 0);
927   CHECK_LT(start_location, resolution);
928   size_t end_location =
929       ((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
930       total_render_size;
931   CHECK_GE(end_location, 0);
932   CHECK_LT(end_location, resolution);
933 
934   for (size_t i = start_location; i <= end_location; ++i) {
935     rendered[i] = c;
936   }
937 }
938 
939 }  // namespace
940 
RenderOccupancy()941 string BFCAllocator::RenderOccupancy() {
942   // Make a buffer for the ASCII-art representation.
943   const size_t resolution = 100;
944   char rendered[resolution];
945 
946   // Compute the total region size to render over
947   size_t total_region_size = 0;
948   for (const auto& region : region_manager_.regions()) {
949     total_region_size += region.memory_size();
950   }
951 
952   if (total_region_size == 0) {
953     return "<allocator contains no memory>";
954   }
955 
956   // Start out with everything empty
957   RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
958                total_region_size, '_');
959 
960   size_t region_offset = 0;
961   for (const auto& region : region_manager_.regions()) {
962     ChunkHandle h = region_manager_.get_handle(region.ptr());
963     // Then render each chunk left to right.
964     while (h != kInvalidChunkHandle) {
965       Chunk* c = ChunkFromHandle(h);
966       if (c->in_use()) {
967         // Render the wasted space
968         size_t wasted = c->size - c->requested_size;
969         if (wasted > 0) {
970           RenderRegion(rendered, resolution, total_region_size,
971                        region_offset + c->requested_size, region.ptr(), c->ptr,
972                        wasted, 'x');
973         }
974         // Then the occupied space
975         RenderRegion(rendered, resolution, total_region_size, region_offset,
976                      region.ptr(), c->ptr, c->requested_size, '*');
977       }
978       h = c->next;
979     }
980     region_offset += region.memory_size();
981   }
982 
983   return string(rendered, resolution);
984 }
985 
DumpMemoryLog(size_t num_bytes)986 void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
987   const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
988   LOG(INFO) << "BFCAllocator dump for " << Name();
989   for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
990     Bin* b = BinFromIndex(bin_num);
991     const BinDebugInfo& bin_info = bin_infos[bin_num];
992     CHECK_EQ(b->free_chunks.size(),
993              bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
994 
995     LOG(INFO) << "Bin (" << b->bin_size
996               << "): \tTotal Chunks: " << bin_info.total_chunks_in_bin
997               << ", Chunks in use: " << bin_info.total_chunks_in_use << ". "
998               << strings::HumanReadableNumBytes(bin_info.total_bytes_in_bin)
999               << " allocated for chunks. "
1000               << strings::HumanReadableNumBytes(bin_info.total_bytes_in_use)
1001               << " in use in bin. "
1002               << strings::HumanReadableNumBytes(
1003                      bin_info.total_requested_bytes_in_use)
1004               << " client-requested in use in bin.";
1005   }
1006 
1007   // Find the bin that we would have liked to allocate in, so we
1008   // can get some further analysis about fragmentation.
1009   Bin* b = BinForSize(num_bytes);
1010 
1011   LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
1012             << " was " << strings::HumanReadableNumBytes(b->bin_size)
1013             << ", Chunk State: ";
1014 
1015   for (ChunkHandle h : b->free_chunks) {
1016     Chunk* c = ChunkFromHandle(h);
1017     LOG(INFO) << c->DebugString(this, true);
1018   }
1019 
1020   // Next show the chunks that are in use, and also summarize their
1021   // number by size.
1022   std::map<size_t, int> in_use_by_size;
1023   for (const auto& region : region_manager_.regions()) {
1024     LOG(INFO) << "Next region of size " << region.memory_size();
1025     ChunkHandle h = region_manager_.get_handle(region.ptr());
1026     while (h != kInvalidChunkHandle) {
1027       const Chunk* c = ChunkFromHandle(h);
1028       if (c->in_use()) {
1029         in_use_by_size[c->size]++;
1030       }
1031       string buf = strings::StrCat(
1032           (c->in_use() ? "InUse" : "Free "), " at ",
1033           strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
1034       if (ShouldRecordOpName()) {
1035         strings::StrAppend(&buf, " by op ", c->GetDebugOpName(),
1036                            " action_count ", c->action_count, " step ",
1037                            c->step_id);
1038       }
1039       strings::StrAppend(&buf, " next ", c->next);
1040       if (timing_counter_) {
1041         strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
1042       }
1043       LOG(INFO) << buf;
1044       h = c->next;
1045     }
1046   }
1047 
1048   LOG(INFO) << "     Summary of in-use Chunks by size: ";
1049   size_t total_bytes = 0;
1050   for (auto& it : in_use_by_size) {
1051     LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
1052               << strings::HumanReadableNumBytes(it.first * it.second);
1053     total_bytes += (it.first * it.second);
1054   }
1055   LOG(INFO) << "Sum Total of in-use chunks: "
1056             << strings::HumanReadableNumBytes(total_bytes);
1057   LOG(INFO) << "total_region_allocated_bytes_: "
1058             << total_region_allocated_bytes_
1059             << " memory_limit_: " << memory_limit_ << " available bytes: "
1060             << (memory_limit_ - total_region_allocated_bytes_)
1061             << " curr_region_allocation_bytes_: "
1062             << curr_region_allocation_bytes_;
1063   LOG(INFO) << "Stats: \n" << stats_.DebugString();
1064 }
1065 
MaybeWriteMemoryMap()1066 void BFCAllocator::MaybeWriteMemoryMap() {
1067   const char* gpu_memory_map_file = std::getenv("TF_BFC_MEMORY_DUMP");
1068   if (gpu_memory_map_file != nullptr) {
1069     std::unique_ptr<WritableFile> dump_file;
1070     string file_name = strings::StrCat(gpu_memory_map_file, "_", Name(), ".",
1071                                        Env::Default()->NowMicros());
1072     Status status = Env::Default()->NewWritableFile(file_name, &dump_file);
1073     if (!status.ok()) {
1074       LOG(ERROR) << "Failed to open file " << file_name;
1075       return;
1076     }
1077     MemoryDump md = RecordMemoryMapInternal();
1078     status = dump_file->Append(md.SerializeAsString());
1079     if (!status.ok()) {
1080       LOG(ERROR) << "Error on writing to file " << gpu_memory_map_file << ": "
1081                  << status;
1082     }
1083   }
1084 }
1085 
RecordMemoryMap()1086 MemoryDump BFCAllocator::RecordMemoryMap() {
1087   mutex_lock l(lock_);
1088   return RecordMemoryMapInternal();
1089 }
1090 
RecordMemoryMapInternal()1091 MemoryDump BFCAllocator::RecordMemoryMapInternal() {
1092   MemoryDump md;
1093   md.set_allocator_name(Name());
1094 
1095   // Record the general stats
1096   MemAllocatorStats* mas = md.mutable_stats();
1097   mas->set_num_allocs(stats_.num_allocs);
1098   mas->set_bytes_in_use(stats_.bytes_in_use);
1099   mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use);
1100   mas->set_largest_alloc_size(stats_.largest_alloc_size);
1101 
1102   // Record summary data for every bin.
1103   const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
1104   for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
1105     Bin* b = BinFromIndex(bin_num);
1106     const BinDebugInfo& bin_info = bin_infos[bin_num];
1107     DCHECK_EQ(b->free_chunks.size(),
1108               bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
1109     BinSummary* bs = md.add_bin_summary();
1110     bs->set_bin(bin_num);
1111     bs->set_total_bytes_in_use(bin_info.total_bytes_in_use);
1112     bs->set_total_bytes_in_bin(bin_info.total_bytes_in_bin);
1113     bs->set_total_chunks_in_use(bin_info.total_chunks_in_use);
1114     bs->set_total_chunks_in_bin(bin_info.total_chunks_in_bin);
1115   }
1116 
1117   // Record state of every defined Chunk.
1118   for (const auto& region : region_manager_.regions()) {
1119     ChunkHandle h = region_manager_.get_handle(region.ptr());
1120     while (h != kInvalidChunkHandle) {
1121       const Chunk* c = ChunkFromHandle(h);
1122       MemChunk* mc = md.add_chunk();
1123       mc->set_in_use(c->in_use());
1124       mc->set_address(reinterpret_cast<uint64>(c->ptr));
1125       mc->set_size(c->size);
1126       mc->set_requested_size(c->requested_size);
1127       mc->set_bin(c->bin_num);
1128       mc->set_op_name(c->GetDebugOpName());
1129       mc->set_step_id(c->step_id);
1130       mc->set_action_count(c->action_count);
1131       if (timing_counter_) {
1132         mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
1133       }
1134       h = c->next;
1135     }
1136   }
1137 
1138   mas->set_fragmentation_metric(GetFragmentation());
1139 
1140   // Record the recent size history
1141   uint64 history_len = std::min(action_counter_, kMemDebugHistorySize);
1142   for (uint64 i = action_counter_ - history_len; i < action_counter_; ++i) {
1143     SnapShot* ss = md.add_snap_shot();
1144     ss->set_action_count(i);
1145     uint64 slot = i % kMemDebugHistorySize;
1146     ss->set_size(size_history_[slot]);
1147   }
1148 
1149   return md;
1150 }
1151 
GetStats()1152 absl::optional<AllocatorStats> BFCAllocator::GetStats() {
1153   mutex_lock l(lock_);
1154   return stats_;
1155 }
1156 
ClearStats()1157 void BFCAllocator::ClearStats() {
1158   mutex_lock l(lock_);
1159   stats_.num_allocs = 0;
1160   stats_.peak_bytes_in_use = stats_.bytes_in_use;
1161   stats_.largest_alloc_size = 0;
1162 }
1163 
1164 std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
get_bin_debug_info()1165 BFCAllocator::get_bin_debug_info() {
1166   std::array<BinDebugInfo, kNumBins> bin_infos;
1167   for (const auto& region : region_manager_.regions()) {
1168     ChunkHandle h = region_manager_.get_handle(region.ptr());
1169     while (h != kInvalidChunkHandle) {
1170       const Chunk* c = ChunkFromHandle(h);
1171       BinNum bin_num = BinNumForSize(c->size);
1172       BinDebugInfo& bin_info = bin_infos[bin_num];
1173       bin_info.total_bytes_in_bin += c->size;
1174       bin_info.total_chunks_in_bin++;
1175       if (c->in_use()) {
1176         bin_info.total_bytes_in_use += c->size;
1177         bin_info.total_requested_bytes_in_use += c->requested_size;
1178         bin_info.total_chunks_in_use++;
1179       } else {
1180         Bin* bin = BinFromIndex(bin_num);
1181         CHECK_EQ(bin->free_chunks.count(h), 1);
1182         CHECK_EQ(c->bin_num, bin_num);
1183       }
1184       h = c->next;
1185     }
1186   }
1187   return bin_infos;
1188 }
1189 
1190 }  // namespace tensorflow
1191