1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
// CUDA virtual memory API is only available in CUDA 10.2 and later.
17 
18 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_VMEM_ALLOCATOR_H_
19 #define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_VMEM_ALLOCATOR_H_
20 
21 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
22 #include "tensorflow/core/framework/allocator.h"
23 #include "tensorflow/core/platform/stream_executor.h"
24 #include "tensorflow/stream_executor/lib/statusor.h"
25 
26 #if GOOGLE_CUDA
27 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
28 #include "tensorflow/stream_executor/gpu/gpu_types.h"
29 #endif
30 
31 #if CUDA_VERSION >= 10020
32 
33 namespace tensorflow {
34 
35 // GpuVirtualMemAllocator is a SubAllocator for use with BFCAllocator which
36 // provides contiguous allocations with each call to Alloc. This is done by
37 // reserving a large chunk of virtual addresses at construction and then mapping
38 // physical memory pages to this virtual address range as requested.
39 //
40 // This class is not thread-safe.
41 class GpuVirtualMemAllocator : public SubAllocator {
42  public:
43   static stream_executor::port::StatusOr<
44       std::unique_ptr<GpuVirtualMemAllocator>>
45   Create(const std::vector<Visitor>& alloc_visitors,
46          const std::vector<Visitor>& free_visitors,
47          stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
48          size_t virtual_address_space_size,
49          const std::vector<PlatformGpuId>& peer_gpu_ids);
50   ~GpuVirtualMemAllocator() override;
51 
52   // Allocates memory at least as large as requested by num_bytes. Will be
53   // aligned to the min allocation granularity (typically 2MiB).
54   // alignment is ignored by this allocator.
55   void* Alloc(size_t alignment, size_t num_bytes,
56               size_t* bytes_received) override;
57 
58   // Frees should only happen at the end of the contiguous memory allocations or
59   // else we introduce pointless fragmentation...But, this is supported. If the
60   // allocation happens at the end, then the next_alloc_offset_ is moved back,
61   // otherwise a hole is created.
62   //
63   // Holes are not re-used, all allocations continue to come at the end of the
64   // next_alloc_offset_. To accommodate this, the virtual_address_space_size
65   // should be much larger than the max physical size of the allocator.
66   //
67   // In practice, since the BFC allocator coalesces adjacent AllocationRegions,
68   // this free function should never be invoked.
69   void Free(void* ptr, size_t num_bytes) override;
70 
SupportsCoalescing()71   bool SupportsCoalescing() const override { return true; }
72 
73  private:
74   GpuVirtualMemAllocator(
75       const std::vector<Visitor>& alloc_visitors,
76       const std::vector<Visitor>& free_visitors,
77       stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
78       std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles,
79       stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity);
80 
81   stream_executor::gpu::GpuContext& gpu_context_;
82   PlatformGpuId gpu_id_;
83 
84   // Peer access is configured at mmap time so the allocator must be aware of
85   // all gpus that may want to read the memory. This list also includes the
86   // above gpu_id_ to facilitate the invocation of the GpuDriver::MapMemory
87   // function.
88   const std::vector<stream_executor::gpu::GpuDeviceHandle> access_gpu_handles_;
89 
90   // The virtual memory span held by this allocator.
91   stream_executor::gpu::GpuDriver::VmemSpan vmem_;
92   // The next offset from the vmem base address that will be allocated. This
93   // corresponds to the size of physically pinned memory if holes haven't been
94   // created with "free".
95   size_t next_alloc_offset_ = 0;
96 
97   // Smallest allocation as determined by CUDA.
98   const size_t granularity_;
99 
100   struct Mapping {
101     stream_executor::gpu::GpuDevicePtr va;
102     stream_executor::gpu::GpuDriver::GenericMemoryHandle physical;
103   };
104   // List of mappings, sorted by va.
105   std::vector<Mapping> mappings_;
106 
107   TF_DISALLOW_COPY_AND_ASSIGN(GpuVirtualMemAllocator);
108 };
109 
110 }  // namespace tensorflow
111 
#endif  // CUDA_VERSION >= 10020
113 
114 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_VMEM_ALLOCATOR_H_
115