1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
#include "tensorflow/core/framework/allocator.h"

#include <memory>

#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/framework/variant.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
26 
27 namespace tensorflow {
28 
DebugString() const29 string AllocatorStats::DebugString() const {
30   return strings::Printf(
31       "Limit:        %20lld\n"
32       "InUse:        %20lld\n"
33       "MaxInUse:     %20lld\n"
34       "NumAllocs:    %20lld\n"
35       "MaxAllocSize: %20lld\n",
36       this->bytes_limit ? *this->bytes_limit : 0, this->bytes_in_use,
37       this->peak_bytes_in_use, this->num_allocs, this->largest_alloc_size);
38 }
39 
40 constexpr size_t Allocator::kAllocatorAlignment;
41 
~Allocator()42 Allocator::~Allocator() {}
43 
RunResourceCtor(ResourceHandle * p,size_t n)44 void RunResourceCtor(ResourceHandle* p, size_t n) {
45   for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
46 }
47 
RunResourceDtor(ResourceHandle * p,size_t n)48 void RunResourceDtor(ResourceHandle* p, size_t n) {
49   for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle();
50 }
51 
RunVariantCtor(Variant * p,size_t n)52 void Allocator::RunVariantCtor(Variant* p, size_t n) {
53   for (size_t i = 0; i < n; ++p, ++i) new (p) Variant();
54 }
55 
RunVariantDtor(Variant * p,size_t n)56 void Allocator::RunVariantDtor(Variant* p, size_t n) {
57   for (size_t i = 0; i < n; ++p, ++i) p->~Variant();
58 }
59 
// When true, the CPU allocator collects additional statistics on every
// allocation and deallocation.
static bool cpu_allocator_collect_stats = false;
// When true, the CPU allocator collects full statistics.
static bool cpu_allocator_collect_full_stats = false;

// A single allocation larger than this fraction of the available RAM triggers
// a warning.
static constexpr double kLargeAllocationWarningThreshold = 0.1;

// While cpu_allocator_collect_stats is true, a warning is emitted once the
// total allocated memory exceeds this fraction of the available RAM.
static constexpr double kTotalAllocationWarningThreshold = 0.5;

// Upper bounds on how many times each kind of warning is logged.
static constexpr int kMaxSingleAllocationWarnings = 5;
static constexpr int kMaxTotalAllocationWarnings = 1;
74 
75 // Cache first invocation to port::AvailableRam, as it can be expensive.
LargeAllocationWarningBytes()76 static int64_t LargeAllocationWarningBytes() {
77   static int64_t value = static_cast<int64>(port::AvailableRam() *
78                                             kLargeAllocationWarningThreshold);
79   return value;
80 }
81 
TotalAllocationWarningBytes()82 static int64_t TotalAllocationWarningBytes() {
83   static int64_t value = static_cast<int64>(port::AvailableRam() *
84                                             kTotalAllocationWarningThreshold);
85   return value;
86 }
87 
EnableCPUAllocatorStats(bool enable)88 void EnableCPUAllocatorStats(bool enable) {
89   cpu_allocator_collect_stats = enable;
90 }
CPUAllocatorStatsEnabled()91 bool CPUAllocatorStatsEnabled() { return cpu_allocator_collect_stats; }
EnableCPUAllocatorFullStats(bool enable)92 void EnableCPUAllocatorFullStats(bool enable) {
93   cpu_allocator_collect_full_stats = enable;
94 }
CPUAllocatorFullStatsEnabled()95 bool CPUAllocatorFullStatsEnabled() { return cpu_allocator_collect_full_stats; }
96 
97 namespace {
98 // A default Allocator for CPU devices.  ProcessState::GetCPUAllocator() will
99 // return a different version that may perform better, but may also lack the
100 // optional stats triggered by the functions above.  TODO(tucker): migrate all
101 // uses of cpu_allocator() except tests to use ProcessState instead.
102 class CPUAllocator : public Allocator {
103  public:
CPUAllocator()104   CPUAllocator()
105       : single_allocation_warning_count_(0),
106         total_allocation_warning_count_(0) {}
107 
~CPUAllocator()108   ~CPUAllocator() override {}
109 
Name()110   string Name() override { return "cpu"; }
111 
AllocateRaw(size_t alignment,size_t num_bytes)112   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
113     if (num_bytes > LargeAllocationWarningBytes() &&
114         single_allocation_warning_count_ < kMaxSingleAllocationWarnings) {
115       ++single_allocation_warning_count_;
116       LOG(WARNING) << "Allocation of " << num_bytes << " exceeds "
117                    << 100 * kLargeAllocationWarningThreshold
118                    << "% of system memory.";
119     }
120 
121     void* p = port::AlignedMalloc(num_bytes, alignment);
122     if (cpu_allocator_collect_stats) {
123       const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
124       mutex_lock l(mu_);
125       ++stats_.num_allocs;
126       stats_.bytes_in_use += alloc_size;
127       stats_.peak_bytes_in_use =
128           std::max<int64>(stats_.peak_bytes_in_use, stats_.bytes_in_use);
129       stats_.largest_alloc_size =
130           std::max<int64>(stats_.largest_alloc_size, alloc_size);
131 
132       if (stats_.bytes_in_use > TotalAllocationWarningBytes() &&
133           total_allocation_warning_count_ < kMaxTotalAllocationWarnings) {
134         ++total_allocation_warning_count_;
135         LOG(WARNING) << "Total allocated memory " << stats_.bytes_in_use
136                      << "exceeds " << 100 * kTotalAllocationWarningThreshold
137                      << "% of system memory";
138       }
139     }
140     return p;
141   }
142 
DeallocateRaw(void * ptr)143   void DeallocateRaw(void* ptr) override {
144     if (cpu_allocator_collect_stats) {
145       const std::size_t alloc_size =
146           port::MallocExtension_GetAllocatedSize(ptr);
147       mutex_lock l(mu_);
148       stats_.bytes_in_use -= alloc_size;
149     }
150     port::AlignedFree(ptr);
151   }
152 
GetStats()153   absl::optional<AllocatorStats> GetStats() override {
154     mutex_lock l(mu_);
155     return stats_;
156   }
157 
ClearStats()158   void ClearStats() override {
159     mutex_lock l(mu_);
160     stats_.num_allocs = 0;
161     stats_.peak_bytes_in_use = stats_.bytes_in_use;
162     stats_.largest_alloc_size = 0;
163   }
164 
AllocatedSizeSlow(const void * ptr)165   size_t AllocatedSizeSlow(const void* ptr) override {
166     return port::MallocExtension_GetAllocatedSize(ptr);
167   }
168 
169  private:
170   mutex mu_;
171   AllocatorStats stats_ GUARDED_BY(mu_);
172 
173   // Use <atomic> for single allocations to avoid mutex contention when
174   // statistics are disabled.
175   std::atomic<int> single_allocation_warning_count_;
176   int total_allocation_warning_count_ GUARDED_BY(mu_);
177 
178   TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator);
179 };
180 
181 class CPUAllocatorFactory : public AllocatorFactory {
182  public:
CreateAllocator()183   Allocator* CreateAllocator() override { return new CPUAllocator; }
184 
CreateSubAllocator(int numa_node)185   SubAllocator* CreateSubAllocator(int numa_node) override {
186     return new CPUSubAllocator(new CPUAllocator);
187   }
188 
189  private:
190   class CPUSubAllocator : public SubAllocator {
191    public:
CPUSubAllocator(CPUAllocator * cpu_allocator)192     explicit CPUSubAllocator(CPUAllocator* cpu_allocator)
193         : SubAllocator({}, {}), cpu_allocator_(cpu_allocator) {}
194 
Alloc(size_t alignment,size_t num_bytes)195     void* Alloc(size_t alignment, size_t num_bytes) override {
196       return cpu_allocator_->AllocateRaw(alignment, num_bytes);
197     }
198 
Free(void * ptr,size_t num_bytes)199     void Free(void* ptr, size_t num_bytes) override {
200       cpu_allocator_->DeallocateRaw(ptr);
201     }
202 
203    private:
204     CPUAllocator* cpu_allocator_;
205   };
206 };
207 
208 REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocatorFactory);
209 }  // namespace
210 
cpu_allocator_base()211 Allocator* cpu_allocator_base() {
212   static Allocator* cpu_alloc =
213       AllocatorFactoryRegistry::singleton()->GetAllocator();
214   // TODO(tucker): This really seems wrong.  It's only going to be effective on
215   // the first call in a process (but the desired effect is associated with a
216   // session), and we probably ought to be tracking the highest level Allocator,
217   // not the lowest.  Revisit the advertised semantics of the triggering option.
218   if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) {
219     cpu_alloc = new TrackingAllocator(cpu_alloc, true);
220   }
221   return cpu_alloc;
222 }
223 
cpu_allocator(int numa_node)224 Allocator* cpu_allocator(int numa_node) {
225   // Correctness relies on devices being created prior to the first call
226   // to cpu_allocator, if devices are ever to be created in the process.
227   // Device creation in turn triggers ProcessState creation and the availability
228   // of the correct access pointer via this function call.
229   static ProcessStateInterface* ps =
230       AllocatorFactoryRegistry::singleton()->process_state();
231   if (ps) {
232     return ps->GetCPUAllocator(numa_node);
233   } else {
234     return cpu_allocator_base();
235   }
236 }
237 
SubAllocator(const std::vector<Visitor> & alloc_visitors,const std::vector<Visitor> & free_visitors)238 SubAllocator::SubAllocator(const std::vector<Visitor>& alloc_visitors,
239                            const std::vector<Visitor>& free_visitors)
240     : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {}
241 
VisitAlloc(void * ptr,int index,size_t num_bytes)242 void SubAllocator::VisitAlloc(void* ptr, int index, size_t num_bytes) {
243   for (const auto& v : alloc_visitors_) {
244     v(ptr, index, num_bytes);
245   }
246 }
247 
VisitFree(void * ptr,int index,size_t num_bytes)248 void SubAllocator::VisitFree(void* ptr, int index, size_t num_bytes) {
249   // Although we don't guarantee any order of visitor application, strive
250   // to apply free visitors in reverse order of alloc visitors.
251   for (int i = free_visitors_.size() - 1; i >= 0; --i) {
252     free_visitors_[i](ptr, index, num_bytes);
253   }
254 }
255 }  // namespace tensorflow
256