1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
17 #define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
18 
19 #include <stdlib.h>
20 
21 #include <limits>
22 
23 #include "absl/strings/string_view.h"
24 #include "absl/types/optional.h"
25 #include "tensorflow/core/framework/numeric_types.h"
26 #include "tensorflow/core/framework/resource_handle.h"
27 #include "tensorflow/core/framework/type_traits.h"
28 #include "tensorflow/core/platform/logging.h"
29 #include "tensorflow/core/platform/mutex.h"
30 #include "tensorflow/core/platform/numa.h"
31 #include "tensorflow/core/platform/types.h"
32 
33 namespace tensorflow {
34 
35 class Variant;
36 
37 // Attributes for a single allocation call. Different calls to the same
38 // allocator could potentially have different allocation attributes.
39 struct AllocationAttributes {
40   // If the first attempt to allocate the memory fails, the allocation
41   // should return immediately without retrying.
42   // An example use case is optional scratch spaces where a failure
43   // has only performance impact.
44   bool no_retry_on_failure = false;
45   // If a Tensor is allocated without the following set to true, then
46   // it is logged as an unknown allocation. During execution Tensors
47   // should be allocated through the OpKernelContext which records
48   // which Op is performing the allocation, and sets this flag to
49   // true.
50   bool allocation_will_be_logged = false;
51   // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
52   // a memory chunk whose last-freed count is at this value or earlier may be
53   // returned.
54   std::function<uint64()> freed_by_func = nullptr;
55 };
56 
57 // Runtime statistics collected by an allocator. Exactly the same as
58 // stream_executor::AllocatorStats, but independently defined to preserve the
59 // mutual independence of StreamExecutor and TensorFlow.
60 struct AllocatorStats {
61   int64 num_allocs;          // Number of allocations.
62   int64 bytes_in_use;        // Number of bytes in use.
63   int64 peak_bytes_in_use;   // The peak bytes in use.
64   int64 largest_alloc_size;  // The largest single allocation seen.
65 
66   // The upper limit of bytes of user allocatable device memory, if such a limit
67   // is known.
68   absl::optional<int64> bytes_limit;
69 
AllocatorStatsAllocatorStats70   AllocatorStats()
71       : num_allocs(0),
72         bytes_in_use(0),
73         peak_bytes_in_use(0),
74         largest_alloc_size(0) {}
75 
76   string DebugString() const;
77 };
78 
79 // Allocator is an abstract interface for allocating and deallocating
80 // device memory.
81 class Allocator {
82  public:
83   // Align to 64 byte boundary.
84   static constexpr size_t kAllocatorAlignment = 64;
85 
86   virtual ~Allocator();
87 
88   // Return a string identifying this allocator
89   virtual string Name() = 0;
90 
91   // Return an uninitialized block of memory that is "num_bytes" bytes
92   // in size.  The returned pointer is guaranteed to be aligned to a
93   // multiple of "alignment" bytes.
94   // REQUIRES: "alignment" is a power of 2.
95   virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;
96 
97   // Return an uninitialized block of memory that is "num_bytes" bytes
98   // in size with specified allocation attributes.  The returned pointer is
99   // guaranteed to be aligned to a multiple of "alignment" bytes.
100   // REQUIRES: "alignment" is a power of 2.
AllocateRaw(size_t alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)101   virtual void* AllocateRaw(size_t alignment, size_t num_bytes,
102                             const AllocationAttributes& allocation_attr) {
103     // The default behavior is to use the implementation without any allocation
104     // attributes.
105     return AllocateRaw(alignment, num_bytes);
106   }
107 
108   // Deallocate a block of memory pointer to by "ptr"
109   // REQUIRES: "ptr" was previously returned by a call to AllocateRaw
110   virtual void DeallocateRaw(void* ptr) = 0;
111 
112   // Convenience functions to do typed allocation.  C++ constructors
113   // and destructors are invoked for complex types if necessary,
114   // depending on the concrete Allocator implementation. May return
115   // NULL if the tensor has too many elements to represent in a single
116   // allocation.
117   template <typename T>
Allocate(size_t num_elements)118   T* Allocate(size_t num_elements) {
119     return Allocate<T>(num_elements, AllocationAttributes());
120   }
121 
122   template <typename T>
Allocate(size_t num_elements,const AllocationAttributes & allocation_attr)123   T* Allocate(size_t num_elements,
124               const AllocationAttributes& allocation_attr) {
125     // TODO(jeff): Do we need to allow clients to pass in alignment
126     // requirements?
127 
128     if (num_elements > (std::numeric_limits<size_t>::max() / sizeof(T))) {
129       return NULL;
130     }
131 
132     void* p = AllocateRaw(kAllocatorAlignment, sizeof(T) * num_elements,
133                           allocation_attr);
134     T* typed_p = reinterpret_cast<T*>(p);
135     if (typed_p) RunCtor<T>(typed_p, num_elements);
136     return typed_p;
137   }
138 
139   template <typename T>
Deallocate(T * ptr,size_t num_elements)140   void Deallocate(T* ptr, size_t num_elements) {
141     if (ptr) {
142       RunDtor<T>(ptr, num_elements);
143       DeallocateRaw(ptr);
144     }
145   }
146 
147   // Returns true if this allocator tracks the sizes of allocations.
148   // RequestedSize and AllocatedSize must be overridden if
149   // TracksAllocationSizes is overridden to return true.
TracksAllocationSizes()150   virtual bool TracksAllocationSizes() { return false; }
151 
152   // Returns true if this allocator requires tensors with 0 elements
153   // to allocate buffers. This is false for most allocators, but may
154   // be used by special-case allocators that want to track tensor
155   // usage.
ShouldAllocateEmptyTensors()156   virtual bool ShouldAllocateEmptyTensors() { return false; }
157 
158   // Returns the user-requested size of the data allocated at
159   // 'ptr'.  Note that the actual buffer allocated might be larger
160   // than requested, but this function returns the size requested by
161   // the user.
162   //
163   // REQUIRES: TracksAllocationSizes() is true.
164   //
165   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
166   // allocated by this allocator.
RequestedSize(const void * ptr)167   virtual size_t RequestedSize(const void* ptr) {
168     CHECK(false) << "allocator doesn't track sizes";
169     return size_t(0);
170   }
171 
172   // Returns the allocated size of the buffer at 'ptr' if known,
173   // otherwise returns RequestedSize(ptr). AllocatedSize(ptr) is
174   // guaranteed to be >= RequestedSize(ptr).
175   //
176   // REQUIRES: TracksAllocationSizes() is true.
177   //
178   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
179   // allocated by this allocator.
AllocatedSize(const void * ptr)180   virtual size_t AllocatedSize(const void* ptr) { return RequestedSize(ptr); }
181 
182   // Returns either 0 or an identifier assigned to the buffer at 'ptr'
183   // when the buffer was returned by AllocateRaw. If non-zero, the
184   // identifier differs from every other ID assigned by this
185   // allocator.
186   //
187   // REQUIRES: TracksAllocationSizes() is true.
188   //
189   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
190   // allocated by this allocator.
AllocationId(const void * ptr)191   virtual int64 AllocationId(const void* ptr) { return 0; }
192 
193   // Returns the allocated size of the buffer at 'ptr' if known,
194   // otherwise returns 0. This method can be called when
195   // TracksAllocationSizes() is false, but can be extremely slow.
196   //
197   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
198   // allocated by this allocator.
AllocatedSizeSlow(const void * ptr)199   virtual size_t AllocatedSizeSlow(const void* ptr) {
200     if (TracksAllocationSizes()) {
201       return AllocatedSize(ptr);
202     }
203     return 0;
204   }
205 
206   // Fills in 'stats' with statistics collected by this allocator.
GetStats()207   virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; }
208 
209   // Clears the internal stats except for the `in_use` field.
ClearStats()210   virtual void ClearStats() {}
211 
212  private:
213   // No constructors or destructors are run for simple types
214   template <typename T>
RunCtor(T * p,size_t n)215   void RunCtor(T* p, size_t n) {
216     static_assert(is_simple_type<T>::value, "T is not a simple type.");
217   }
218 
219   template <typename T>
RunDtor(T * p,size_t n)220   void RunDtor(T* p, size_t n) {}
221 
222   // custom constructors and destructors that can be overridden for
223   // non-standard allocators
224 
225   // Runs string's default constructor for  p[0], p[1], ..., p[n-1].
RunStringCtor(string * p,size_t n)226   virtual void RunStringCtor(string* p, size_t n) {
227     for (size_t i = 0; i < n; ++p, ++i) new (p) string();
228   }
229 
230   // Runs string's default destructor for  p[0], p[1], ..., p[n-1].
RunStringDtor(string * p,size_t n)231   virtual void RunStringDtor(string* p, size_t n) {
232     for (size_t i = 0; i < n; ++p, ++i) p->~string();
233   }
234 
RunResourceCtor(ResourceHandle * p,size_t n)235   virtual void RunResourceCtor(ResourceHandle* p, size_t n) {
236     for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
237   }
238 
239   // Runs string's default destructor for  p[0], p[1], ..., p[n-1].
RunResourceDtor(ResourceHandle * p,size_t n)240   virtual void RunResourceDtor(ResourceHandle* p, size_t n) {
241     for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle();
242   }
243 
244   virtual void RunVariantCtor(Variant* p, size_t n);
245 
246   virtual void RunVariantDtor(Variant* p, size_t n);
247 
248   // TODO(jeff): Maybe provide some interface to give info about
249   // current allocation state (total number of bytes available for
250   // allocation, number of bytes free on device, etc.)
251 };
252 
253 // Allocator-specific constructors and destructors are used for
254 // strings
255 template <>
RunCtor(string * p,size_t n)256 inline void Allocator::RunCtor(string* p, size_t n) {
257   RunStringCtor(p, n);
258 }
259 
260 template <>
RunDtor(string * p,size_t n)261 inline void Allocator::RunDtor(string* p, size_t n) {
262   RunStringDtor(p, n);
263 }
264 
265 template <>
RunCtor(ResourceHandle * p,size_t n)266 inline void Allocator::RunCtor(ResourceHandle* p, size_t n) {
267   RunResourceCtor(p, n);
268 }
269 
270 template <>
RunDtor(ResourceHandle * p,size_t n)271 inline void Allocator::RunDtor(ResourceHandle* p, size_t n) {
272   RunResourceDtor(p, n);
273 }
274 
275 template <>
RunCtor(Variant * p,size_t n)276 inline void Allocator::RunCtor(Variant* p, size_t n) {
277   RunVariantCtor(p, n);
278 }
279 
280 template <>
RunDtor(Variant * p,size_t n)281 inline void Allocator::RunDtor(Variant* p, size_t n) {
282   RunVariantDtor(p, n);
283 }
284 
285 // An implementation of Allocator that delegates all calls to another Allocator.
286 //
287 // Useful to clients who want to override part of the functionality of another
288 // allocator.
289 class AllocatorWrapper : public Allocator {
290  public:
AllocatorWrapper(Allocator * wrapped)291   explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {}
292 
~AllocatorWrapper()293   ~AllocatorWrapper() override {}
294 
295   // Returns the wrapped allocator to which all calls are delegated.
wrapped()296   Allocator* wrapped() const { return wrapped_; }
297 
Name()298   string Name() override { return wrapped_->Name(); }
299 
AllocateRaw(size_t alignment,size_t num_bytes)300   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
301     return wrapped_->AllocateRaw(alignment, num_bytes);
302   }
303 
AllocateRaw(size_t alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)304   void* AllocateRaw(size_t alignment, size_t num_bytes,
305                     const AllocationAttributes& allocation_attr) override {
306     return wrapped_->AllocateRaw(alignment, num_bytes, allocation_attr);
307   }
308 
DeallocateRaw(void * ptr)309   void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }
310 
TracksAllocationSizes()311   bool TracksAllocationSizes() override {
312     return wrapped_->TracksAllocationSizes();
313   }
314 
ShouldAllocateEmptyTensors()315   bool ShouldAllocateEmptyTensors() override {
316     return wrapped_->TracksAllocationSizes();
317   }
318 
RequestedSize(const void * ptr)319   size_t RequestedSize(const void* ptr) override {
320     return wrapped_->RequestedSize(ptr);
321   }
322 
AllocatedSize(const void * ptr)323   size_t AllocatedSize(const void* ptr) override {
324     return wrapped_->AllocatedSize(ptr);
325   }
326 
AllocationId(const void * ptr)327   int64 AllocationId(const void* ptr) override {
328     return wrapped_->AllocationId(ptr);
329   }
330 
AllocatedSizeSlow(const void * ptr)331   size_t AllocatedSizeSlow(const void* ptr) override {
332     return wrapped_->AllocatedSizeSlow(ptr);
333   }
334 
335  private:
336   Allocator* const wrapped_;
337 };
338 
339 // A tensorflow Op may need access to different kinds of memory that
340 // are not simply a function of the device to which the Op has been
341 // assigned.  For example, an Op executing on a GPU may still need
342 // to allocate CPU RAM for some purpose.  Internal to the tensorflow
343 // runtime we may choose to allocate CPU ram from special regions
344 // that have been prepared for higher performance in some use
345 // contexts, e.g. doing DMA with particular devices.  For these
346 // reasons, the Device interface does not expose just one memory
347 // Allocator, but instead provides an accessor that takes a
348 // specification of the desired memory attributes in order to select
349 // an Allocator.
350 //
351 // Example use:
352 //  // Allocator for ordinary device memory:
353 //  Allocator* a = allocator(AllocatorAttributes());
354 // ...
355 //  // Allocator for CPU RAM, regardless of where Op is executing:
356 //  AllocatorAttributes attr;
357 //  attr.set_on_host(true);
358 //  Allocator* a = allocator(attr);
359 struct AllocatorAttributes {
set_on_hostAllocatorAttributes360   void set_on_host(bool v) { value |= (static_cast<int>(v)); }
on_hostAllocatorAttributes361   bool on_host() const { return value & 0x1; }
set_nic_compatibleAllocatorAttributes362   void set_nic_compatible(bool v) { value |= (static_cast<int>(v) << 1); }
nic_compatibleAllocatorAttributes363   bool nic_compatible() const { return value & (0x1 << 1); }
set_gpu_compatibleAllocatorAttributes364   void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); }
gpu_compatibleAllocatorAttributes365   bool gpu_compatible() const { return value & (0x1 << 2); }
MergeAllocatorAttributes366   void Merge(AllocatorAttributes other) {
367     value |= other.value;
368     scope_id = (scope_id > 0 && other.scope_id == 0)
369                    ? scope_id
370                    : ((scope_id == 0) ? other.scope_id : 0);
371   }
372   // Returns true if the fields set in *this is a subset of or equal to
373   // those set in other.
IsEqualOrLessRestrictiveThanAllocatorAttributes374   bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const {
375     return (value | other.value) == other.value;
376   }
377 
378   // NOTE: The upper 8 bits of the value are reserved for
379   // device-specific uses.  Implementors of a device can interpret these
380   // upper 8 bits in device-specific ways, and ops implemented for those
381   // devices are responsible for setting those 8 bits appropriately.
382   uint32 value = 0;
383   // EXPERIMENTAL: If this is greater than zero, then allocation is delegated to
384   // a named special-purpose allocator on the same device.
385   int32 scope_id = 0;
386 };
387 
// Returns a trivial implementation of Allocator, which is a process singleton.
// Access through this function is only intended for use by restricted parts
// of the infrastructure.
Allocator* cpu_allocator_base();

// If available, calls ProcessState::GetCPUAllocator(numa_node).
// If not, falls back to cpu_allocator_base().
// Intended for use in contexts where ProcessState is not visible at
// compile time. Where ProcessState is visible, it's preferable to
// call it directly.
// "numa_node" defaults to port::kNUMANoAffinity.
Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity);

// If 'enable' is true, the default CPU allocator implementation will collect
// AllocatorStats. By default, it's disabled.
void EnableCPUAllocatorStats(bool enable);
// NOTE(review): presumably reports the setting controlled by
// EnableCPUAllocatorStats -- confirm against the definition.
bool CPUAllocatorStatsEnabled();

// If 'enable' is true, the default CPU allocator implementation will collect
// full statistics. By default, it's disabled.
void EnableCPUAllocatorFullStats(bool enable);
// NOTE(review): presumably reports the setting controlled by
// EnableCPUAllocatorFullStats -- confirm against the definition.
bool CPUAllocatorFullStatsEnabled();
409 
// Performs the underlying suballoc/free of memory on behalf of a higher-level
// allocator.  The expectation is that the higher-level allocator does some
// kind of cache or pool management, so that it calls SubAllocator::Alloc and
// Free relatively infrequently compared to the number of times its own
// allocation and deallocation methods are called.
class SubAllocator {
 public:
  // Callback invoked with a pointer to a memory area, an index, and the
  // area's size in bytes.  The index value will be numa_node for a CPU
  // allocator and GPU id for a GPU allocator.
  using Visitor = std::function<void(void*, int index, size_t)>;

  // Takes the visitors for allocation and for free events. Declaration only;
  // the definition is not part of this header.
  SubAllocator(const std::vector<Visitor>& alloc_visitors,
               const std::vector<Visitor>& free_visitors);

  virtual ~SubAllocator() = default;

  // Allocation / release primitives supplied by concrete subclasses.
  virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
  virtual void Free(void* ptr, size_t num_bytes) = 0;

 protected:
  // Implementation of Alloc() method must call this on newly allocated
  // value.
  void VisitAlloc(void* ptr, int index, size_t num_bytes);

  // Implementation of Free() method must call this on value to be
  // freed immediately before deallocation.
  void VisitFree(void* ptr, int index, size_t num_bytes);

  // Visitor lists; const, so they cannot be modified after construction.
  const std::vector<Visitor> alloc_visitors_;
  const std::vector<Visitor> free_visitors_;
};
441 
442 }  // namespace tensorflow
443 
444 #endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
445