1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // The CUDA implementation of the StreamExecutorInterface functionality.
17 // CUDA inclusions are ideally confined to this implementation file.
18 //
19 // The notions from the StreamExecutor basically correspond to the CUDA streams
20 // programming model provided by the libcuda.so driver APIs, so we don't have
21 // to do much more than wrap the calls to the libraries appropriately.
22 #ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
23 #define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
24 
25 #include <set>
26 #include <unordered_map>
27 
28 #include "absl/strings/string_view.h"
29 #include "tensorflow/stream_executor/event.h"
30 #include "tensorflow/stream_executor/gpu/gpu_kernel.h"
31 #include "tensorflow/stream_executor/lib/status.h"
32 #include "tensorflow/stream_executor/lib/statusor.h"
33 #include "tensorflow/stream_executor/platform.h"
34 #include "tensorflow/stream_executor/platform/mutex.h"
35 #include "tensorflow/stream_executor/platform/port.h"
36 #include "tensorflow/stream_executor/platform/thread_annotations.h"
37 #include "tensorflow/stream_executor/stream_executor_internal.h"
38 
39 namespace stream_executor {
40 namespace gpu {
41 
// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
44 class GpuExecutor : public internal::StreamExecutorInterface {
45  public:
46   // sub_platform indicates the subplatform used in this executor; it must
47   // be a CUDA type.
GpuExecutor(const PluginConfig & plugin_config)48   explicit GpuExecutor(const PluginConfig& plugin_config)
49       : device_(0),
50         context_(nullptr),
51         device_ordinal_(0),
52         cc_major_(0),
53         cc_minor_(0),
54         version_(0),
55         plugin_config_(plugin_config) {}
56 
57   // See the corresponding StreamExecutor methods for method comments on the
58   // following overrides.
59 
60   ~GpuExecutor() override;
61 
62   port::Status Init(int device_ordinal, DeviceOptions device_options) override;
63 
64   bool GetKernel(const MultiKernelLoaderSpec& spec,
65                  KernelBase* kernel) override;
66   // (supported on CUDA only)
67   void UnloadKernel(const KernelBase* kernel) override;
68   bool LoadModule(const MultiModuleLoaderSpec& spec,
69                   ModuleHandle* module_handle) override;
70   bool UnloadModule(ModuleHandle module_handle) override;
71 
72   bool Launch(Stream* stream, const ThreadDim& thread_dims,
73               const BlockDim& block_dims, const KernelBase& k,
74               const KernelArgsArrayBase& args) override;
75 
76   // (supported on CUDA only)
77   int CalculateOccupancy(const DeviceDescription& device_description,
78                          uint64 registers_per_thread,
79                          uint64 shared_memory_per_block,
80                          const ThreadDim& thread_dims, GpuFunctionHandle func);
81 
82   // (supported on CUDA only)
83   int CompareOccupancy(int* initial_blocks,
84                        const DeviceDescription& device_description,
85                        uint64 registers_per_thread,
86                        uint64 shared_memory_per_block,
87                        const ThreadDim& thread_dims, GpuFunctionHandle func);
88 
89   void* Allocate(uint64 size) override;
90 
91   void* AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
92                           uint64 size_bytes) override;
93 
94   void Deallocate(DeviceMemoryBase* mem) override;
95 
UnifiedMemoryAllocate(uint64 size)96   void* UnifiedMemoryAllocate(uint64 size) override {
97     return GpuDriver::UnifiedMemoryAllocate(context_, size);
98   }
99 
UnifiedMemoryDeallocate(void * location)100   void UnifiedMemoryDeallocate(void* location) override {
101     return GpuDriver::UnifiedMemoryDeallocate(context_, location);
102   }
103 
104   // CUDA allocation/registration functions are necessary because the driver
105   // internally sets up buffers for DMA operations (and page locks them).
106   // There's no external interface for us to otherwise control these DMA
107   // settings.
HostMemoryAllocate(uint64 size)108   void* HostMemoryAllocate(uint64 size) override {
109     return GpuDriver::HostAllocate(context_, size);
110   }
111 
HostMemoryDeallocate(void * location)112   void HostMemoryDeallocate(void* location) override {
113     return GpuDriver::HostDeallocate(context_, location);
114   }
115 
116   bool HostMemoryRegister(void* location, uint64 size) override;
117 
118   bool HostMemoryUnregister(void* location) override;
119 
120   bool SynchronizeAllActivity() override;
121 
122   bool SynchronousMemZero(DeviceMemoryBase* location, uint64 size) override;
123 
124   bool SynchronousMemSet(DeviceMemoryBase* location, int value,
125                          uint64 size) override;
126 
127   port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
128                                  const void* host_src, uint64 size) override;
129 
130   port::Status SynchronousMemcpy(void* host_dst,
131                                  const DeviceMemoryBase& gpu_src,
132                                  uint64 size) override;
133 
134   port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
135                                                const DeviceMemoryBase& gpu_src,
136                                                uint64 size) override;
137 
138   bool MemZero(Stream* stream, DeviceMemoryBase* location,
139                uint64 size) override;
140   bool Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
141               uint64 size) override;
142   bool Memset32(Stream* stream, DeviceMemoryBase* location, uint32 pattern,
143                 uint64 size) override;
144 
145   bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
146               uint64 size) override;
147 
148   bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
149               uint64 size) override;
150 
151   bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
152                             const DeviceMemoryBase& gpu_src,
153                             uint64 size) override;
154 
155   bool HostCallback(Stream* stream,
156                     std::function<port::Status()> callback) override;
157 
158   bool AllocateStream(Stream* stream) override;
159 
160   void DeallocateStream(Stream* stream) override;
161 
162   bool CreateStreamDependency(Stream* dependent, Stream* other) override;
163 
164   bool AllocateTimer(Timer* timer) override;
165 
166   void DeallocateTimer(Timer* timer) override;
167 
168   bool StartTimer(Stream* stream, Timer* timer) override;
169 
170   bool StopTimer(Stream* stream, Timer* timer) override;
171 
172   port::Status AllocateEvent(Event* event) override;
173 
174   port::Status DeallocateEvent(Event* event) override;
175 
176   port::Status RecordEvent(Stream* stream, Event* event) override;
177 
178   port::Status WaitForEvent(Stream* stream, Event* event) override;
179 
180   Event::Status PollForEventStatus(Event* event) override;
181 
182   port::Status BlockHostUntilDone(Stream* stream) override;
183 
PlatformDeviceCount()184   int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }
185 
186   port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;
187 
188   bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
189 
190   SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
191 
192   port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
193 
194   bool DeviceMemoryUsage(int64* free, int64* total) const override;
195 
196   // Search for the symbol and returns a device pointer and size.
197   // Returns false if symbol does not exist.
198   bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
199                  void** mem, size_t* bytes) override;
200 
201   DeviceDescription* PopulateDeviceDescription() const override;
202 
203   // Populates the block_dim_limit by querying the device driver API. If an
204   // error occurs at any point while asking the driver for block dim limits, it
205   // will be only partially populated as a result, and an error will be logged.
206   bool FillBlockDimLimit(BlockDim* block_dim_limit) const;
207 
208   bool SupportsBlas() const override;
209 
210   blas::BlasSupport* CreateBlas() override;
211 
212   bool SupportsFft() const override;
213 
214   fft::FftSupport* CreateFft() override;
215 
216   bool SupportsRng() const override;
217 
218   rng::RngSupport* CreateRng() override;
219 
220   bool SupportsDnn() const override;
221 
222   dnn::DnnSupport* CreateDnn() override;
223 
224   std::unique_ptr<internal::EventInterface> CreateEventImplementation()
225       override;
226 
227   std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
228       override;
229 
230   std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
231 
232   std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
233 
234   void* GpuContextHack() override;
235 
236   GpuContext* gpu_context();
237 
238  private:
239   // Attempts to find a more specific version of the file indicated by
240   // filename by looking for compute-capability-specific suffixed versions; i.e.
241   // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
242   // we're on a compute capability 3.0 machine.
243   // (supported on CUDA only)
244   bool FindOnDiskForComputeCapability(absl::string_view filename,
245                                       absl::string_view canonical_suffix,
246                                       string* found_filename) const;
247 
248   // Attempts to find a more specific version of the file indicated by
249   // filename by looking for AMDGPU ISA-specific suffixed versions.
250   // (supported on ROCm only)
251 
252   bool FindOnDiskForISAVersion(absl::string_view filename,
253                                absl::string_view canonical_suffix,
254                                string* found_filename) const;
255 
256   // Host callback landing routine invoked by CUDA.
257   // data: User-provided callback provided to HostCallback() above, captured
258   //       as a std::function<void()>. Allocated/initialized inside
259   //       HostCallback() and owned and deleted by this call.
260   static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
261                                    void* data);
262 
263   // Collects metadata for the specified kernel.
264   bool GetKernelMetadata(GpuKernel* cuda_kernel,
265                          KernelMetadata* kernel_metadata);
266 
267   // Prints to VLOG(2) information about the kernel's occupancy and how it might
268   // be improved.
269   void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
270                          const BlockDim& block_dims);
271 
272   // (supported on CUDA only)
273   bool LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
274       EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
275 
276   // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
277   // (supported on CUDA only)
278   bool LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
279       EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
280 
281   // (supported on ROCm only)
282   bool LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
283       EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
284 
285   bool UnloadGpuBinary(const void* gpu_binary)
286       EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
287 
288   // Guards the on-disk-module mapping.
289   mutex disk_modules_mu_;
290 
291   // Mapping from filename to GPUModuleHandle, if it was already retrieved.
292   // Multiple GPUFunctionHandle are usually obtained from a single
293   // GPUModuleHandle so we attempt to hit in this mapping first, before
294   // retrieving it.
295   std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);
296 
297   // Guards the in-memory-module mapping.
298   mutex in_memory_modules_mu_;
299 
300   std::map<const char*, GpuModuleHandle> in_memory_modules_
301       GUARDED_BY(in_memory_modules_mu_);
302 
303   // Kernel -> loaded GPU binary. Many kernels may load the same binary.
304   std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
305       GUARDED_BY(in_memory_modules_mu_);
306   // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
307   std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
308       gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
309 
310   // Guards the launched kernel set.
311   mutex launched_kernels_mu_;
312 
313   // Keeps track of the set of launched kernels. Currently used to suppress the
314   // occupancy check on subsequent launches.
315   std::set<GpuFunctionHandle> launched_kernels_
316       GUARDED_BY(launched_kernels_mu_);
317 
318   // Handle for the CUDA device being operated on. Immutable
319   // post-initialization.
320   GpuDeviceHandle device_;
321 
322   // Handle for session with the library/driver. Immutable post-initialization.
323   GpuContext* context_;
324 
325   // The device ordinal value that this executor was initialized with; recorded
326   // for use in getting device metadata. Immutable post-initialization.
327   int device_ordinal_;
328 
329   // The major verion of the compute capability for device_.
330   int cc_major_;
331 
332   // The minor verion of the compute capability for device_.
333   int cc_minor_;
334 
335   // GPU ISA version for device_.
336   int version_;
337 
338   // The plugin configuration associated with this instance.
339   PluginConfig plugin_config_;
340 
341   SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
342 };
343 
344 }  // namespace gpu
345 }  // namespace stream_executor
346 
347 #endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
348