1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
17 #define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
18 
#include <stdlib.h>

#include <functional>
#include <limits>
#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/numa.h"
#include "tensorflow/core/platform/types.h"
32 
33 namespace tensorflow {
34 
35 class TensorShape;
36 
37 // Attributes for a single allocation call. Different calls to the same
38 // allocator could potentially have different allocation attributes.
39 struct AllocationAttributes {
40   AllocationAttributes() = default;
41 
AllocationAttributesAllocationAttributes42   AllocationAttributes(bool retry_on_failure, bool allocation_will_be_logged,
43                        std::function<uint64()>* freed_by_func)
44       : retry_on_failure(retry_on_failure),
45         allocation_will_be_logged(allocation_will_be_logged),
46         freed_by_func(freed_by_func) {}
47 
48   // If the first attempt to allocate the memory fails, the allocation should
49   // wait and retry (with a timeout).
50   //
51   // This is usually set to true, but we may set it to false in cases where a
52   // failure has only performance impact (e.g. optional scratch space
53   // allocation).
54   bool retry_on_failure = true;
55   // If a Tensor is allocated without the following set to true, then
56   // it is logged as an unknown allocation. During execution Tensors
57   // should be allocated through the OpKernelContext which records
58   // which Op is performing the allocation, and sets this flag to
59   // true.
60   bool allocation_will_be_logged = false;
61   // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
62   // a memory chunk whose freed_at_count is at this value or earlier may be
63   // returned.
64   std::function<uint64()>* freed_by_func = nullptr;  // Not owned.
65 
66   TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
67 };
68 
// Annotations for memory profiling and debugging purpose. The runtime will
// cache the annotations in thread-local memory, and some allocators will try to
// tag allocations with the annotations.
//
// The default member values below are the same values that
// ScopedMemoryDebugAnnotation::CleanupAnnotation() resets each field to.
// The pointer fields are stored as plain pointers; nothing in this struct
// copies or frees what they point to.
struct MemoryDebugAnnotation {
  // Op name supplied to a ScopedMemoryDebugAnnotation constructor.
  const char* pending_op_name = nullptr;
  // Step id supplied to a ScopedMemoryDebugAnnotation constructor.
  int64 pending_step_id = 0;
  // Region type supplied to a ScopedMemoryDebugAnnotation constructor.
  const char* pending_region_type = nullptr;
  // Data type supplied to a ScopedMemoryDebugAnnotation constructor.
  // NOTE(review): presumably a DataType enum value stored as int32 — confirm.
  int32 pending_data_type = 0;
  // Shape supplied to a ScopedMemoryDebugAnnotation constructor.
  const TensorShape* pending_shape = nullptr;
};
79 
80 // Wrapper class of MemoryDebugAnnotation for RAII.
81 class ScopedMemoryDebugAnnotation {
82  public:
CurrentAnnotation()83   static const MemoryDebugAnnotation& CurrentAnnotation() {
84     return annotation_;
85   }
86 
ScopedMemoryDebugAnnotation(const char * op_name)87   explicit ScopedMemoryDebugAnnotation(const char* op_name) {
88     last_annotation_ = annotation_;
89     CleanupAnnotation();
90     annotation_.pending_op_name = op_name;
91   }
92 
ScopedMemoryDebugAnnotation(const char * op_name,int64 step_id)93   explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id) {
94     last_annotation_ = annotation_;
95     CleanupAnnotation();
96     annotation_.pending_op_name = op_name;
97     annotation_.pending_step_id = step_id;
98   }
99 
100   // This constructor keeps the pending_op_name and pending_step_id from parent
101   // (if any).  Otherwise it overwrites with op_name.
ScopedMemoryDebugAnnotation(const char * op_name,const char * region_type,int32 data_type,const TensorShape * shape)102   explicit ScopedMemoryDebugAnnotation(const char* op_name,
103                                        const char* region_type, int32 data_type,
104                                        const TensorShape* shape) {
105     last_annotation_ = annotation_;
106     if (!annotation_.pending_op_name) {
107       annotation_.pending_op_name = op_name;
108     }
109     annotation_.pending_region_type = region_type;
110     annotation_.pending_data_type = data_type;
111     annotation_.pending_shape = shape;
112   }
113 
ScopedMemoryDebugAnnotation(const char * op_name,int64 step_id,const char * region_type,int32 data_type,const TensorShape * shape)114   explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id,
115                                        const char* region_type, int32 data_type,
116                                        const TensorShape* shape) {
117     last_annotation_ = annotation_;
118     annotation_.pending_op_name = op_name;
119     annotation_.pending_step_id = step_id;
120     annotation_.pending_region_type = region_type;
121     annotation_.pending_data_type = data_type;
122     annotation_.pending_shape = shape;
123   }
124 
~ScopedMemoryDebugAnnotation()125   ~ScopedMemoryDebugAnnotation() { annotation_ = last_annotation_; }
126 
127  private:
CleanupAnnotation()128   void CleanupAnnotation() {
129     annotation_.pending_op_name = nullptr;
130     annotation_.pending_step_id = 0;
131     annotation_.pending_region_type = nullptr;
132     annotation_.pending_data_type = 0;
133     annotation_.pending_shape = nullptr;
134   }
135 
136   // Stores the current annotations.
137   static thread_local MemoryDebugAnnotation annotation_;
138 
139   // Stores the previous values in case the annotations are nested.
140   MemoryDebugAnnotation last_annotation_;
141 
142   TF_DISALLOW_COPY_AND_ASSIGN(ScopedMemoryDebugAnnotation);
143 };
144 
145 // Runtime statistics collected by an allocator. Exactly the same as
146 // stream_executor::AllocatorStats, but independently defined to preserve the
147 // mutual independence of StreamExecutor and TensorFlow.
148 struct AllocatorStats {
149   int64 num_allocs;          // Number of allocations.
150   int64 bytes_in_use;        // Number of bytes in use.
151   int64 peak_bytes_in_use;   // The peak bytes in use.
152   int64 largest_alloc_size;  // The largest single allocation seen.
153 
154   // The upper limit of bytes of user allocatable device memory, if such a limit
155   // is known.
156   absl::optional<int64> bytes_limit;
157 
158   // Stats for reserved memory usage.
159   int64 bytes_reserved;       // Number of bytes reserved.
160   int64 peak_bytes_reserved;  // The peak number of bytes reserved.
161   // The upper limit on the number bytes of reservable memory,
162   // if such a limit is known.
163   absl::optional<int64> bytes_reservable_limit;
164 
165   int64 largest_free_block_bytes;  // Largest free block's size in heap.
166 
AllocatorStatsAllocatorStats167   AllocatorStats()
168       : num_allocs(0),
169         bytes_in_use(0),
170         peak_bytes_in_use(0),
171         largest_alloc_size(0),
172         bytes_reserved(0),
173         peak_bytes_reserved(0),
174         largest_free_block_bytes(0) {}
175 
176   std::string DebugString() const;
177 };
178 
179 // Allocator is an abstract interface for allocating and deallocating
180 // device memory.
181 class Allocator {
182  public:
183   // Align to 64 byte boundary.
184   static constexpr size_t kAllocatorAlignment = 64;
185 
186   virtual ~Allocator();
187 
188   // Return a string identifying this allocator
189   virtual std::string Name() = 0;
190 
191   // Return an uninitialized block of memory that is "num_bytes" bytes
192   // in size.  The returned pointer is guaranteed to be aligned to a
193   // multiple of "alignment" bytes.
194   // REQUIRES: "alignment" is a power of 2.
195   virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;
196 
197   // Return an uninitialized block of memory that is "num_bytes" bytes
198   // in size with specified allocation attributes.  The returned pointer is
199   // guaranteed to be aligned to a multiple of "alignment" bytes.
200   // REQUIRES: "alignment" is a power of 2.
AllocateRaw(size_t alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)201   virtual void* AllocateRaw(size_t alignment, size_t num_bytes,
202                             const AllocationAttributes& allocation_attr) {
203     // The default behavior is to use the implementation without any allocation
204     // attributes.
205     return AllocateRaw(alignment, num_bytes);
206   }
207 
208   // Deallocate a block of memory pointer to by "ptr"
209   // REQUIRES: "ptr" was previously returned by a call to AllocateRaw
210   virtual void DeallocateRaw(void* ptr) = 0;
211 
212   // Returns true if this allocator tracks the sizes of allocations.
213   // RequestedSize and AllocatedSize must be overridden if
214   // TracksAllocationSizes is overridden to return true.
TracksAllocationSizes()215   virtual bool TracksAllocationSizes() const { return false; }
216 
217   // Returns true if this allocator allocates an opaque handle rather than the
218   // requested number of bytes.
219   //
220   // This method returns false for most allocators, but may be used by
221   // special-case allocators that track tensor usage. If this method returns
222   // true, AllocateRaw() should be invoked for all values of `num_bytes`,
223   // including 0.
224   //
225   // NOTE: It is the caller's responsibility to track whether an allocated
226   // object is a buffer or an opaque handle. In particular, when this method
227   // returns `true`, users of this allocator must not run any constructors or
228   // destructors for complex objects, since there is no backing store for the
229   // tensor in which to place their outputs.
AllocatesOpaqueHandle()230   virtual bool AllocatesOpaqueHandle() const { return false; }
231 
232   // Returns the user-requested size of the data allocated at
233   // 'ptr'.  Note that the actual buffer allocated might be larger
234   // than requested, but this function returns the size requested by
235   // the user.
236   //
237   // REQUIRES: TracksAllocationSizes() is true.
238   //
239   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
240   // allocated by this allocator.
RequestedSize(const void * ptr)241   virtual size_t RequestedSize(const void* ptr) const {
242     CHECK(false) << "allocator doesn't track sizes";
243     return size_t(0);
244   }
245 
246   // Returns the allocated size of the buffer at 'ptr' if known,
247   // otherwise returns RequestedSize(ptr). AllocatedSize(ptr) is
248   // guaranteed to be >= RequestedSize(ptr).
249   //
250   // REQUIRES: TracksAllocationSizes() is true.
251   //
252   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
253   // allocated by this allocator.
AllocatedSize(const void * ptr)254   virtual size_t AllocatedSize(const void* ptr) const {
255     return RequestedSize(ptr);
256   }
257 
258   // Returns either 0 or an identifier assigned to the buffer at 'ptr'
259   // when the buffer was returned by AllocateRaw. If non-zero, the
260   // identifier differs from every other ID assigned by this
261   // allocator.
262   //
263   // REQUIRES: TracksAllocationSizes() is true.
264   //
265   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
266   // allocated by this allocator.
AllocationId(const void * ptr)267   virtual int64 AllocationId(const void* ptr) const { return 0; }
268 
269   // Returns the allocated size of the buffer at 'ptr' if known,
270   // otherwise returns 0. This method can be called when
271   // TracksAllocationSizes() is false, but can be extremely slow.
272   //
273   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
274   // allocated by this allocator.
AllocatedSizeSlow(const void * ptr)275   virtual size_t AllocatedSizeSlow(const void* ptr) const {
276     if (TracksAllocationSizes()) {
277       return AllocatedSize(ptr);
278     }
279     return 0;
280   }
281 
282   // Fills in 'stats' with statistics collected by this allocator.
GetStats()283   virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; }
284 
285   // Clears the internal stats except for the `in_use` field.
ClearStats()286   virtual void ClearStats() {}
287 
SetSafeFrontier(uint64 count)288   virtual void SetSafeFrontier(uint64 count) {}
289 };
290 
291 // An implementation of Allocator that delegates all calls to another Allocator.
292 //
293 // Useful to clients who want to override part of the functionality of another
294 // allocator.
295 class AllocatorWrapper : public Allocator {
296  public:
AllocatorWrapper(Allocator * wrapped)297   explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {}
298 
~AllocatorWrapper()299   ~AllocatorWrapper() override {}
300 
301   // Returns the wrapped allocator to which all calls are delegated.
wrapped()302   Allocator* wrapped() const { return wrapped_; }
303 
Name()304   std::string Name() override { return wrapped_->Name(); }
305 
AllocateRaw(size_t alignment,size_t num_bytes)306   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
307     return wrapped_->AllocateRaw(alignment, num_bytes);
308   }
309 
AllocateRaw(size_t alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)310   void* AllocateRaw(size_t alignment, size_t num_bytes,
311                     const AllocationAttributes& allocation_attr) override {
312     return wrapped_->AllocateRaw(alignment, num_bytes, allocation_attr);
313   }
314 
DeallocateRaw(void * ptr)315   void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }
316 
TracksAllocationSizes()317   bool TracksAllocationSizes() const override {
318     return wrapped_->TracksAllocationSizes();
319   }
320 
AllocatesOpaqueHandle()321   bool AllocatesOpaqueHandle() const override {
322     return wrapped_->AllocatesOpaqueHandle();
323   }
324 
RequestedSize(const void * ptr)325   size_t RequestedSize(const void* ptr) const override {
326     return wrapped_->RequestedSize(ptr);
327   }
328 
AllocatedSize(const void * ptr)329   size_t AllocatedSize(const void* ptr) const override {
330     return wrapped_->AllocatedSize(ptr);
331   }
332 
AllocationId(const void * ptr)333   int64 AllocationId(const void* ptr) const override {
334     return wrapped_->AllocationId(ptr);
335   }
336 
AllocatedSizeSlow(const void * ptr)337   size_t AllocatedSizeSlow(const void* ptr) const override {
338     return wrapped_->AllocatedSizeSlow(ptr);
339   }
340 
341  private:
342   Allocator* const wrapped_;
343 };
344 
345 // A tensorflow Op may need access to different kinds of memory that
346 // are not simply a function of the device to which the Op has been
347 // assigned.  For example, an Op executing on a GPU may still need
348 // to allocate CPU RAM for some purpose.  Internal to the tensorflow
349 // runtime we may choose to allocate CPU ram from special regions
350 // that have been prepared for higher performance in some use
351 // contexts, e.g. doing DMA with particular devices.  For these
352 // reasons, the Device interface does not expose just one memory
353 // Allocator, but instead provides an accessor that takes a
354 // specification of the desired memory attributes in order to select
355 // an Allocator.
356 //
357 // Example use:
358 //  // Allocator for ordinary device memory:
359 //  Allocator* a = allocator(AllocatorAttributes());
360 // ...
361 //  // Allocator for CPU RAM, regardless of where Op is executing:
362 //  AllocatorAttributes attr;
363 //  attr.set_on_host(true);
364 //  Allocator* a = allocator(attr);
365 struct AllocatorAttributes {
set_on_hostAllocatorAttributes366   void set_on_host(bool v) { value |= (static_cast<int>(v)); }
on_hostAllocatorAttributes367   bool on_host() const { return value & 0x1; }
set_nic_compatibleAllocatorAttributes368   void set_nic_compatible(bool v) { value |= (static_cast<int>(v) << 1); }
nic_compatibleAllocatorAttributes369   bool nic_compatible() const { return value & (0x1 << 1); }
set_gpu_compatibleAllocatorAttributes370   void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); }
gpu_compatibleAllocatorAttributes371   bool gpu_compatible() const { return value & (0x1 << 2); }
MergeAllocatorAttributes372   void Merge(AllocatorAttributes other) {
373     value |= other.value;
374     if (scope_id != other.scope_id) {
375       CHECK(scope_id == 0 || other.scope_id == 0)
376           << "At least one scope_id should be zero to merge "
377              "AllocatorAttributes but found this.scope_id="
378           << scope_id << " and other.scope_id=" << other.scope_id;
379       scope_id = scope_id == 0 ? other.scope_id : scope_id;
380     }
381   }
382   // Returns true if the fields set in *this is a subset of or equal to
383   // those set in other.
IsEqualOrLessRestrictiveThanAllocatorAttributes384   bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const {
385     return (value | other.value) == other.value;
386   }
387 
388   // NOTE: The upper 8 bits of the value are reserved for
389   // device-specific uses.  Implementors of a device can interpret these
390   // upper 8 bits in device-specific ways, and ops implemented for those
391   // devices are responsible for setting those 8 bits appropriately.
392   uint32 value = 0;
393   // EXPERIMENTAL: If this is greater than zero, then allocation is delegated to
394   // a named special-purpose allocator on the same device.
395   int32 scope_id = 0;
396 
397   // Returns a human readable representation of this.
398   std::string DebugString() const;
399 };
400 
// Returns a trivial implementation of Allocator, which is a process singleton.
// Access through this function is only intended for use by restricted parts
// of the infrastructure.
// NOTE(review): since the allocator is described as a process singleton,
// callers presumably must not delete the returned pointer — confirm.
Allocator* cpu_allocator_base();

// If available, calls ProcessState::GetCPUAllocator(numa_node).
// If not, falls back to cpu_allocator_base().
// Intended for use in contexts where ProcessState is not visible at
// compile time. Where ProcessState is visible, it's preferable to
// call it directly.
// When the argument is omitted, `numa_node` is port::kNUMANoAffinity.
Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity);
412 
// Enables AllocatorStats in the default CPU allocator implementation.  By
// default, it's disabled.
void EnableCPUAllocatorStats();
// Disables AllocatorStats in the default CPU allocator implementation.  By
// default, it's disabled.
void DisableCPUAllocatorStats();
// NOTE(review): presumably reports whether the stats toggled by
// EnableCPUAllocatorStats()/DisableCPUAllocatorStats() are currently enabled —
// confirm against the definition.
bool CPUAllocatorStatsEnabled();

// Enables full statistics collection in the default CPU allocator
// implementation.  By default, it's disabled.
// No matching "disable" function is declared in this part of the header.
void EnableCPUAllocatorFullStats();
// NOTE(review): presumably reports whether EnableCPUAllocatorFullStats() has
// taken effect — confirm against the definition.
bool CPUAllocatorFullStatsEnabled();
425 
// Performs the underlying suballoc/free of memory on behalf of a higher-level
// allocator.  The higher-level allocator is expected to do some kind of cache
// or pool management, so that SubAllocator::Alloc and Free are called
// relatively infrequently compared to the number of times its own AllocateRaw
// and DeallocateRaw methods are called.
class SubAllocator {
 public:
  // A Visitor is invoked with a pointer to a memory area, an index, and the
  // area's size in bytes.  The index value will be numa_node for a CPU
  // allocator and GPU id for a GPU allocator.
  using Visitor = std::function<void(void*, int index, size_t)>;

  // Both visitor lists are kept in the const members declared below.
  SubAllocator(const std::vector<Visitor>& alloc_visitors,
               const std::vector<Visitor>& free_visitors);

  virtual ~SubAllocator() = default;

  // Allocates at least num_bytes and reports the number of bytes actually
  // allocated through bytes_received.  The caller can safely use the full
  // bytes_received sized buffer following the returned pointer.
  virtual void* Alloc(size_t alignment, size_t num_bytes,
                      size_t* bytes_received) = 0;

  // Releases the `num_bytes`-sized area at `ptr`.
  // NOTE(review): presumably `ptr` must have come from Alloc() on this object
  // — confirm against the implementations.
  virtual void Free(void* ptr, size_t num_bytes) = 0;

  // Returns true if the BFC allocator can safely coalesce adjacent regions
  // returned by this allocator.
  virtual bool SupportsCoalescing() const = 0;

 protected:
  // Every Alloc() implementation must call this on the newly allocated
  // value.
  void VisitAlloc(void* ptr, int index, size_t num_bytes);

  // Every Free() implementation must call this on the value about to be
  // freed, immediately before deallocation.
  void VisitFree(void* ptr, int index, size_t num_bytes);

  const std::vector<Visitor> alloc_visitors_;
  const std::vector<Visitor> free_visitors_;
};
465 
466 }  // namespace tensorflow
467 
468 #endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
469