1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
17 
18 #if defined(__APPLE__)
19 #include <mach-o/dyld.h>
20 #endif
21 #if defined(PLATFORM_WINDOWS)
22 #include <windows.h>
23 #define PATH_MAX MAX_PATH
24 #else
25 #include <unistd.h>
26 #endif
27 #include "absl/strings/str_cat.h"
28 #include "absl/strings/string_view.h"
29 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
30 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
31 #include "tensorflow/stream_executor/cuda/cuda_event.h"
32 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
33 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
34 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
35 #include "tensorflow/stream_executor/kernel_cache_config.h"
36 #include "tensorflow/stream_executor/lib/env.h"
37 #include "tensorflow/stream_executor/lib/error.h"
38 #include "tensorflow/stream_executor/lib/initialize.h"
39 #include "tensorflow/stream_executor/lib/mathutil.h"
40 #include "tensorflow/stream_executor/lib/numbers.h"
41 #include "tensorflow/stream_executor/lib/path.h"
42 #include "tensorflow/stream_executor/lib/process_state.h"
43 #include "tensorflow/stream_executor/lib/ptr_util.h"
44 #include "tensorflow/stream_executor/lib/statusor.h"
45 #include "tensorflow/stream_executor/lib/str_util.h"
46 #include "tensorflow/stream_executor/lib/stringprintf.h"
47 #include "tensorflow/stream_executor/platform.h"
48 #include "tensorflow/stream_executor/platform/logging.h"
49 #include "tensorflow/stream_executor/platform/port.h"
50 #include "tensorflow/stream_executor/plugin_registry.h"
51 #include "tensorflow/stream_executor/stream.h"
52 #include "tensorflow/stream_executor/stream_executor_internal.h"
53 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
54 #include "tensorflow/stream_executor/timer.h"
55 
56 // LOG(ERROR) uses a const named ERROR, so a macro with the same name is
57 // always unwanted. This happens on Windows that defines such a macro.
58 #undef ERROR
59 
60 #ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
61 #error \
62     "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
63 #endif
64 
65 #ifdef __CUDA_RUNTIME_H__
66 #error \
67     "CUDA runtime being included into CUDA GPU executor; should be driver only."
68 #endif
69 
// Declared here only; the definition lives in another translation unit.
extern bool FLAGS_check_gpu_leaks;
// Defined here and enabled by default.
// NOTE(review): presumably selects CUBIN over PTX when both are available --
// confirm against the code that reads this flag.
bool FLAGS_prefer_cubin_to_ptx = true;
72 
73 namespace stream_executor {
74 namespace gpu {
75 
// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver library
// can cause it to crash, but loading only CUBINs avoids those crashes;
// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of
// PTX code.
//
// As this is an implementation-detail workaround, the usage is to declare this
// variable with extern linkage and populate it from another translation unit.
//
// The hook is default-constructed here, i.e. it holds no callable until some
// other translation unit assigns one.
std::function<string(const string &)> g_cubinate;
85 
AsGpuEvent(Event * event)86 static GpuEvent* AsGpuEvent(Event* event) {
87   DCHECK(event != nullptr);
88   return static_cast<GpuEvent*>(event->implementation());
89 }
90 
91 // Given a platform-independent timer datatype, returns the internal CUDA
92 // platform implementation pointer.
AsGpuTimer(Timer * timer)93 static GpuTimer* AsGpuTimer(Timer* timer) {
94   DCHECK(timer != nullptr);
95   return static_cast<GpuTimer*>(timer->implementation());
96 }
97 
98 // Given const GPU memory, returns a libcuda device pointer datatype, suitable
99 // for passing directly to libcuda APIs.
100 //
101 // N.B. we must lose constness in order to pass a suitable type to the existing
102 // libcuda APIs, so the caller should take care to only pass the result of const
103 // GPU memory conversions to libcuda functions which will honor constness.
AsCudaDevicePtr(const DeviceMemoryBase & gpu_mem)104 static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase &gpu_mem) {
105   return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
106 }
107 
108 // See description on const version above.
AsCudaDevicePtr(DeviceMemoryBase * gpu_mem)109 static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
110   return AsCudaDevicePtr(*gpu_mem);
111 }
112 
ExtractGpuContext(GpuExecutor * cuda_exec)113 GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
114   CHECK(cuda_exec != nullptr);
115   return cuda_exec->gpu_context();
116 }
117 
ExtractGpuExecutor(StreamExecutor * stream_exec)118 GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
119   return static_cast<GpuExecutor*>(stream_exec->implementation());
120 }
121 
~GpuExecutor()122 GpuExecutor::~GpuExecutor() {
123   CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
124   CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
125   if (context_ != nullptr) {
126     GpuDriver::DestroyContext(context_);
127   }
128 }
129 
Init(int device_ordinal,DeviceOptions device_options)130 port::Status GpuExecutor::Init(int device_ordinal,
131                                DeviceOptions device_options) {
132   device_ordinal_ = device_ordinal;
133 
134   auto status = GpuDriver::Init();
135   if (!status.ok()) {
136     return status;
137   }
138 
139   status = GpuDriver::GetDevice(device_ordinal_, &device_);
140   if (!status.ok()) {
141     return status;
142   }
143 
144   status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
145                                     &context_);
146   if (!status.ok()) {
147     return status;
148   }
149 
150   return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
151 }
152 
FindOnDiskForComputeCapability(absl::string_view filename,absl::string_view canonical_suffix,string * found_filename) const153 bool GpuExecutor::FindOnDiskForComputeCapability(
154     absl::string_view filename, absl::string_view canonical_suffix,
155     string* found_filename) const {
156   if (cc_major_ == 0 && cc_minor_ == 0) {
157     return false;
158   }
159 
160   string cc_specific =
161       absl::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
162   if (port::FileExists(cc_specific).ok()) {
163     VLOG(2) << "found compute-capability-specific file, using that: "
164             << cc_specific;
165     *found_filename = cc_specific;
166     return true;
167   }
168 
169   VLOG(2) << "could not find compute-capability specific file at: "
170           << cc_specific;
171   if (port::FileExists(string(filename)).ok()) {
172     *found_filename = string(filename);
173     return true;
174   }
175 
176   return false;
177 }
178 
FindOnDiskForISAVersion(absl::string_view filename,absl::string_view canonical_suffix,string * found_filename) const179 bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
180                                           absl::string_view canonical_suffix,
181                                           string* found_filename) const {
182   LOG(ERROR)
183       << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
184   return false;
185 }
186 // Returns the path to the running executable.
187 // N.B. Derived from //knowledge/smalltalk/background_kb.cc
188 // Arg: strip_exe: if true, remove the name of the executable itself from the
189 //                 returned string. Example: calling this from /usr/bin/foo
190 //                 would return /usr/bin.
GetBinaryDir(bool strip_exe)191 static string GetBinaryDir(bool strip_exe) {
192   char exe_path[PATH_MAX] = {0};
193 #if defined(__APPLE__)
194   uint32_t buffer_size = 0U;
195   _NSGetExecutablePath(nullptr, &buffer_size);
196   char unresolved_path[buffer_size];
197   _NSGetExecutablePath(unresolved_path, &buffer_size);
198   CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
199 #else
200 #if defined(PLATFORM_WINDOWS)
201   HMODULE hModule = GetModuleHandle(NULL);
202   GetModuleFileName(hModule, exe_path, MAX_PATH);
203 #else
204   CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
205 #endif
206 #endif
207   // Make sure it's null-terminated:
208   exe_path[sizeof(exe_path) - 1] = 0;
209 
210   if (strip_exe) {
211     // The exe is the last component of the path, so remove one component.
212     string ret = exe_path;
213     std::vector<string> components = port::Split(exe_path, '/');
214     components.pop_back();
215     return port::Join(components, "/");
216   }
217   return exe_path;
218 }
219 
LoadModuleFromCuBin(const char * cubin,CUmodule * module)220 bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, CUmodule* module) {
221   uint64_t module_refcount;
222   std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
223 
224   if (*module == nullptr) {
225     auto load_status = GpuDriver::LoadCubin(context_, cubin, module);
226     if (!load_status.ok()) {
227       LOG(ERROR) << "failed to load CUBIN: " << load_status;
228       return false;
229     }
230     module_refcount = 1;
231     VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
232             << " as module " << *module;
233   } else {
234     ++module_refcount;
235     VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
236             << " is already loaded as module " << *module;
237   }
238   gpu_binary_to_module_[cubin] = {*module, module_refcount};
239   return true;
240 }
241 
LoadModuleFromPtx(const char * ptx,CUmodule * module)242 bool GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
243   uint64_t module_refcount;
244   std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
245 
246   if (*module == nullptr) {
247     if (!GpuDriver::LoadPtx(context_, ptx, module)) {
248       return false;
249     }
250     VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
251             << *module;
252     module_refcount = 1;
253   } else {
254     ++module_refcount;
255     VLOG(3) << "PTX " << static_cast<const void *>(ptx)
256             << " is already loaded as module " << module;
257   }
258   gpu_binary_to_module_[ptx] = {*module, module_refcount};
259   return true;
260 }
261 
LoadModuleFromHsaco(const char * hsaco,CUmodule * module)262 bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, CUmodule* module) {
263   LOG(ERROR) << "Feature not supported on CUDA platform (LoadModuleFromHsaco)";
264   return false;
265 }
266 
GetKernel(const MultiKernelLoaderSpec & spec,KernelBase * kernel)267 bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
268                             KernelBase* kernel) {
269   GpuKernel* cuda_kernel = AsGpuKernel(kernel);
270   CUmodule module;
271   const string *kernelname;
272 
273   VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
274 
275   if (spec.has_cuda_cubin_in_memory()) {
276     mutex_lock lock{in_memory_modules_mu_};
277     kernelname = &spec.cuda_cubin_in_memory().kernelname();
278     const char *cubin = spec.cuda_cubin_in_memory().bytes();
279     if (!LoadModuleFromCuBin(cubin, &module)) {
280       return false;
281     }
282     kernel_to_gpu_binary_[kernel] = cubin;
283   } else if (spec.has_cuda_ptx_in_memory()) {
284     kernelname = &spec.cuda_ptx_in_memory().kernelname();
285 
286     if (cc_major_ == 0 && cc_minor_ == 0) {
287       return false;
288     }
289 
290     const char *ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
291     if (ptx == nullptr) {
292       ptx = spec.cuda_ptx_in_memory().default_text();
293     }
294     if (ptx == nullptr) {
295       LOG(FATAL) << "loader spec has no ptx for kernel " << *kernelname;
296       return false;
297     }
298 
299     mutex_lock lock{in_memory_modules_mu_};
300     if (!LoadModuleFromPtx(ptx, &module)) {
301       return false;
302     }
303     kernel_to_gpu_binary_[kernel] = ptx;
304   } else {
305     LOG(WARNING) << "no method of loading CUDA kernel provided";
306     return false;
307   }
308   VLOG(2) << "getting function " << *kernelname << " from module " << module;
309   if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
310                                     cuda_kernel->gpu_function_ptr())) {
311     return false;
312   }
313 
314   // We have to trust the kernel loader spec arity because there doesn't appear
315   // to be a way to reflect on the number of expected arguments w/the CUDA API.
316   cuda_kernel->set_arity(spec.arity());
317 
318   KernelMetadata kernel_metadata;
319   if (!GetKernelMetadata(cuda_kernel, &kernel_metadata)) {
320     LOG(WARNING) << "unable to get metadata for kernel " << *kernelname;
321   }
322   kernel->set_metadata(kernel_metadata);
323   kernel->set_name(*kernelname);
324   return true;
325 }
326 
UnloadGpuBinary(const void * gpu_binary)327 bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
328   auto module_it = gpu_binary_to_module_.find(gpu_binary);
329   if (gpu_binary_to_module_.end() == module_it) {
330     VLOG(3) << "No loaded CUDA module for " << gpu_binary;
331     return false;
332   }
333   auto &module = module_it->second.first;
334   auto &refcount = module_it->second.second;
335   VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
336   if (--refcount == 0) {
337     VLOG(3) << "Unloading CUDA module " << module;
338     GpuDriver::UnloadModule(context_, module);
339     gpu_binary_to_module_.erase(module_it);
340   }
341   return true;
342 }
343 
UnloadKernel(const KernelBase * kernel)344 void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
345   VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
346 
347   mutex_lock lock{in_memory_modules_mu_};
348   auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
349   if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
350     VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
351             << " has never been loaded.";
352     return;  // We've never seen this kernel.
353   }
354   VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
355           << " has loaded GPU code " << gpu_binary_it->second;
356   UnloadGpuBinary(gpu_binary_it->second);
357   kernel_to_gpu_binary_.erase(gpu_binary_it);
358 }
359 
LoadModule(const MultiModuleLoaderSpec & spec,ModuleHandle * module_handle)360 bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
361                              ModuleHandle* module_handle) {
362   // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
363   // ModuleHandle::id().
364   CUmodule cu_module;
365   if (spec.has_cuda_cubin_in_memory()) {
366     mutex_lock lock{in_memory_modules_mu_};
367     if (!LoadModuleFromCuBin(
368             reinterpret_cast<const char *>(spec.cuda_cubin_in_memory().data()),
369             &cu_module)) {
370       return false;
371     }
372     *module_handle = ModuleHandle(const_cast<void *>(
373         static_cast<const void *>(spec.cuda_cubin_in_memory().data())));
374     return true;
375   } else if (spec.has_cuda_ptx_in_memory()) {
376     if (cc_major_ == 0 && cc_minor_ == 0) {
377       return false;
378     }
379 
380     if (!spec.cuda_ptx_in_memory()) {
381       return false;
382     }
383 
384     mutex_lock lock{in_memory_modules_mu_};
385     if (!LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module)) {
386       return false;
387     }
388     *module_handle = ModuleHandle(const_cast<void *>(
389         static_cast<const void *>(spec.cuda_ptx_in_memory())));
390     return true;
391   }
392   LOG(WARNING) << "no method of loading CUDA module provided";
393   return false;
394 }
395 
UnloadModule(ModuleHandle module_handle)396 bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
397   const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
398   mutex_lock lock{in_memory_modules_mu_};
399   return UnloadGpuBinary(gpu_binary);
400 }
401 
GetKernelMetadata(GpuKernel * cuda_kernel,KernelMetadata * kernel_metadata)402 bool GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
403                                     KernelMetadata* kernel_metadata) {
404   int value;
405   if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
406                                    *cuda_kernel->gpu_function_ptr(), &value)) {
407     return false;
408   }
409   kernel_metadata->set_registers_per_thread(value);
410 
411   if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
412                                    *cuda_kernel->gpu_function_ptr(), &value)) {
413     return false;
414   }
415   kernel_metadata->set_shared_memory_bytes(value);
416 
417   return true;
418 }
419 
Launch(Stream * stream,const ThreadDim & thread_dims,const BlockDim & block_dims,const KernelBase & kernel,const KernelArgsArrayBase & args)420 bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
421                          const BlockDim& block_dims, const KernelBase& kernel,
422                          const KernelArgsArrayBase& args) {
423   CHECK_EQ(kernel.Arity(), args.number_of_arguments());
424   CUstream custream = AsGpuStreamValue(stream);
425   const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
426   CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
427 
428   // Only perform/print the occupancy check once.  Even just checking to see
429   // whether we've done an occupancy check on this kernel before isn't free
430   // (because we have to synchronize), so we only do this at -v 2+.
431   if (VLOG_IS_ON(2)) {
432     mutex_lock lock(launched_kernels_mu_);
433     if (!launched_kernels_.count(cufunc)) {
434       VlogOccupancyInfo(kernel, thread_dims, block_dims);
435       // TODO(rspringer): Remove elements from launched_kernels_...if we ever
436       // expose a kernel/module deallocation method.
437       launched_kernels_.insert(cufunc);
438     }
439   }
440 
441   if (cuda_kernel->GetPreferredCacheConfig() !=
442       KernelCacheConfig::kNoPreference) {
443     GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
444   }
445 
446   void **kernel_params = const_cast<void **>(args.argument_addresses().data());
447 
448   if (!GpuDriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
449                                block_dims.z, thread_dims.x, thread_dims.y,
450                                thread_dims.z, args.number_of_shared_bytes(),
451                                custream, kernel_params,
452                                nullptr /* = extra */)) {
453     LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
454                << args.number_of_arguments()
455                << " args; thread dim: " << thread_dims.ToString()
456                << "; block dim: " << block_dims.ToString();
457     return false;
458   }
459 
460   return true;
461 }
462 
463 // This is a non-essential operation; if there's a failure, proceed without
464 // logging an error. It's nearly certain that in case of failures, we'd never
465 // get here in the first place; these are very low-impact routines.
VlogOccupancyInfo(const KernelBase & kernel,const ThreadDim & thread_dims,const BlockDim & block_dims)466 void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
467                                     const ThreadDim& thread_dims,
468                                     const BlockDim& block_dims) {
469   VLOG(2) << "Computing kernel occupancy for kernel "
470           << kernel.demangled_name();
471   VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
472           << ", " << thread_dims.z << ")";
473 
474   int regs_per_thread;
475   if (!kernel.metadata().registers_per_thread(&regs_per_thread)) {
476     return;
477   }
478 
479   int smem_per_block;
480   if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) {
481     return;
482   }
483 
484   const DeviceDescription &device_description =
485       kernel.parent()->GetDeviceDescription();
486 
487   const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
488   CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
489 
490   int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
491                                          smem_per_block, thread_dims, cufunc);
492   VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;
493 
494   int suggested_threads =
495       CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
496                        smem_per_block, thread_dims, cufunc);
497   if (suggested_threads != 0) {
498     VLOG(2) << "The cuda occupancy calculator recommends using "
499             << suggested_threads
500             << " threads per block to achieve an occupancy of " << blocks_per_sm
501             << " blocks per SM.";
502   }
503 }
504 
505 // Compute and return maximum blocks per core (occupancy) based on the
506 // device description, some kernel characteristics and the number of threads per
507 // block.  If unable to compute occupancy, zero is returned.
CalculateOccupancy(const DeviceDescription & device_description,uint64 registers_per_thread,uint64 shared_memory_per_block,const ThreadDim & thread_dims,CUfunction func)508 int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
509                                     uint64 registers_per_thread,
510                                     uint64 shared_memory_per_block,
511                                     const ThreadDim& thread_dims,
512                                     CUfunction func) {
513   int suggested_blocks = 0;
514   int suggested_threads = 0;
515   CUresult err = cuOccupancyMaxPotentialBlockSize(
516       &suggested_blocks, &suggested_threads, func, nullptr,
517       shared_memory_per_block, 0);
518   CHECK_EQ(err, CUDA_SUCCESS);
519   return suggested_blocks;
520 }
521 
522 // Compute and return the suggested thread count to achieve ideal occupancy.
523 // If the provided thread dimensions match this number, zero is returned.
CompareOccupancy(int * initial_blocks,const DeviceDescription & device_description,uint64 registers_per_thread,uint64 shared_memory_per_block,const ThreadDim & thread_dims,CUfunction func)524 int GpuExecutor::CompareOccupancy(int* initial_blocks,
525                                   const DeviceDescription& device_description,
526                                   uint64 registers_per_thread,
527                                   uint64 shared_memory_per_block,
528                                   const ThreadDim& thread_dims,
529                                   CUfunction func) {
530   int suggested_blocks = 0;
531   int suggested_threads = 0;
532   CUresult err = cuOccupancyMaxPotentialBlockSize(
533       &suggested_blocks, &suggested_threads, func, nullptr,
534       shared_memory_per_block, 0);
535   CHECK_EQ(err, CUDA_SUCCESS);
536   if (suggested_blocks > *initial_blocks) {
537     *initial_blocks = suggested_blocks;
538     return suggested_threads;
539   } else {
540     return 0;
541   }
542 }
543 
Allocate(uint64 size)544 void* GpuExecutor::Allocate(uint64 size) {
545   return GpuDriver::DeviceAllocate(context_, size);
546 }
547 
AllocateSubBuffer(DeviceMemoryBase * mem,uint64 offset_bytes,uint64 size_bytes)548 void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
549                                      uint64 size_bytes) {
550   // offset and size are in bytes, so char* works as the pointer type.
551   return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
552 }
553 
Deallocate(DeviceMemoryBase * mem)554 void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
555   // CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary.
556   if (!mem->is_sub_buffer()) {
557     GpuDriver::DeviceDeallocate(context_, mem->opaque());
558   }
559 }
560 
HostMemoryRegister(void * location,uint64 size)561 bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
562   if (location == nullptr || size == 0) {
563     LOG(WARNING) << "attempting to register null or zero-sized memory: "
564                  << location << "; size " << size;
565   }
566   VLOG(2) << "registering " << location << " size " << size;
567   return GpuDriver::HostRegister(context_, location, size);
568 }
569 
HostMemoryUnregister(void * location)570 bool GpuExecutor::HostMemoryUnregister(void* location) {
571   VLOG(2) << "unregistering " << location;
572   return GpuDriver::HostUnregister(context_, location);
573 }
574 
SynchronizeAllActivity()575 bool GpuExecutor::SynchronizeAllActivity() {
576   return GpuDriver::SynchronizeContext(context_);
577 }
578 
SynchronousMemZero(DeviceMemoryBase * location,uint64 size)579 bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
580   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
581       size % 4 == 0) {
582     return GpuDriver::SynchronousMemsetUint32(
583         context_, AsCudaDevicePtr(location), 0x0, size / 4);
584   }
585   return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
586                                            0x0, size);
587 }
588 
SynchronousMemSet(DeviceMemoryBase * location,int value,uint64 size)589 bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
590                                     uint64 size) {
591   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
592       size % 4 == 0) {
593     // cudaMemset reinterprets "value" as a uint8.
594     uint8 byte_value = static_cast<uint8>(value);
595     uint32 pattern = (byte_value << 24) | (byte_value << 16) |
596                      (byte_value << 8) | byte_value;
597     return GpuDriver::SynchronousMemsetUint32(
598         context_, AsCudaDevicePtr(location), pattern, size / 4);
599   }
600   return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
601                                            value, size);
602 }
603 
SynchronousMemcpy(DeviceMemoryBase * gpu_dst,const void * host_src,uint64 size)604 port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
605                                             const void* host_src, uint64 size) {
606   return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
607                                          host_src, size);
608 }
609 
SynchronousMemcpy(void * host_dst,const DeviceMemoryBase & gpu_src,uint64 size)610 port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
611                                             const DeviceMemoryBase& gpu_src,
612                                             uint64 size) {
613   return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
614                                          AsCudaDevicePtr(gpu_src), size);
615 }
616 
SynchronousMemcpyDeviceToDevice(DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64 size)617 port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
618     DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
619   return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
620                                          AsCudaDevicePtr(gpu_src), size);
621 }
622 
MemZero(Stream * stream,DeviceMemoryBase * location,uint64 size)623 bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
624                           uint64 size) {
625   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
626       size % 4 == 0) {
627     return Memset32(stream, location, 0x0, size);
628   } else {
629     return Memset(stream, location, 0x0, size);
630   }
631 }
632 
Memset(Stream * stream,DeviceMemoryBase * location,uint8 pattern,uint64 size)633 bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
634                          uint8 pattern, uint64 size) {
635   VLOG(2) << "enqueueing memset8 operation onto stream " << stream
636           << " at location " << location << " with size " << size
637           << " and pattern " << std::hex << pattern;
638   return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
639                                             pattern, size,
640                                             AsGpuStreamValue(stream));
641 }
642 
Memset32(Stream * stream,DeviceMemoryBase * location,uint32 pattern,uint64 size)643 bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
644                            uint32 pattern, uint64 size) {
645   VLOG(2) << "enqueueing memset32 operation onto stream " << stream
646           << " at location " << location << " with size " << size
647           << " and pattern " << std::hex << pattern;
648   CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
649         size % 4 == 0);
650   return GpuDriver::AsynchronousMemsetUint32(
651       context_, AsCudaDevicePtr(location), pattern, size / 4,
652       AsGpuStreamValue(stream));
653 }
654 
Memcpy(Stream * stream,void * host_dst,const DeviceMemoryBase & gpu_src,uint64 size)655 bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
656                          const DeviceMemoryBase& gpu_src, uint64 size) {
657   return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
658                                           AsCudaDevicePtr(gpu_src), size,
659                                           AsGpuStreamValue(stream));
660 }
661 
Memcpy(Stream * stream,DeviceMemoryBase * gpu_dst,const void * host_src,uint64 size)662 bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
663                          const void* host_src, uint64 size) {
664   return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
665                                           host_src, size,
666                                           AsGpuStreamValue(stream));
667 }
668 
MemcpyDeviceToDevice(Stream * stream,DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64 size)669 bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
670                                        DeviceMemoryBase* gpu_dst,
671                                        const DeviceMemoryBase& gpu_src,
672                                        uint64 size) {
673   return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
674                                           AsCudaDevicePtr(gpu_src), size,
675                                           AsGpuStreamValue(stream));
676 }
677 
HostCallback(Stream * stream,std::function<port::Status ()> callback)678 bool GpuExecutor::HostCallback(Stream* stream,
679                                std::function<port::Status()> callback) {
680   auto callback_ptr = new std::function<void()>([callback]() {
681     port::Status s = callback();
682     if (!s.ok()) {
683       LOG(WARNING) << "Host callback failed: " << s;
684     }
685   });
686   return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
687                                       InternalHostCallback, callback_ptr);
688 }
689 
InternalHostCallback(CUstream stream,CUresult status,void * data)690 /* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
691                                                     CUresult status,
692                                                     void* data) {
693   std::function<void()> *callback =
694       reinterpret_cast<std::function<void()> *>(data);
695   (*callback)();
696   delete callback;
697 }
698 
AllocateEvent(Event * event)699 port::Status GpuExecutor::AllocateEvent(Event* event) {
700   return AsGpuEvent(event)->Init();
701 }
702 
DeallocateEvent(Event * event)703 port::Status GpuExecutor::DeallocateEvent(Event* event) {
704   return AsGpuEvent(event)->Destroy();
705 }
706 
RecordEvent(Stream * stream,Event * event)707 port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
708   return AsGpuEvent(event)->Record(AsGpuStream(stream));
709 }
710 
WaitForEvent(Stream * stream,Event * event)711 port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
712   if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
713                                    AsGpuEvent(event)->gpu_event())) {
714     return port::Status::OK();
715   } else {
716     return port::Status(
717         port::error::INTERNAL,
718         port::Printf("error recording waiting for CUDA event on stream %p",
719                      stream));
720   }
721 }
722 
PollForEventStatus(Event * event)723 Event::Status GpuExecutor::PollForEventStatus(Event* event) {
724   return AsGpuEvent(event)->PollForStatus();
725 }
726 
AllocateStream(Stream * stream)727 bool GpuExecutor::AllocateStream(Stream* stream) {
728   return AsGpuStream(stream)->Init();
729 }
730 
DeallocateStream(Stream * stream)731 void GpuExecutor::DeallocateStream(Stream* stream) {
732   GpuStream* cuda_stream = AsGpuStream(stream);
733   if (!cuda_stream->IsIdle()) {
734     LOG(ERROR) << "Deallocating stream with pending work";
735   }
736   cuda_stream->Destroy();
737 }
738 
AllocateTimer(Timer * timer)739 bool GpuExecutor::AllocateTimer(Timer* timer) {
740   return AsGpuTimer(timer)->Init();
741 }
742 
DeallocateTimer(Timer * timer)743 void GpuExecutor::DeallocateTimer(Timer* timer) {
744   AsGpuTimer(timer)->Destroy();
745 }
746 
CreateStreamDependency(Stream * dependent,Stream * other)747 bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
748   CUevent other_completed_event = *AsGpuStream(other)->completed_event();
749   bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
750                                    AsGpuStreamValue(other))
751                 .ok();
752   if (!ok) {
753     LOG(ERROR) << "failed to record completion event; "
754                   "therefore, failed to create inter-stream dependency";
755     return false;
756   }
757 
758   return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
759                                       other_completed_event);
760 }
761 
StartTimer(Stream * stream,Timer * timer)762 bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
763   return AsGpuTimer(timer)->Start(AsGpuStream(stream));
764 }
765 
StopTimer(Stream * stream,Timer * timer)766 bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
767   return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
768 }
769 
BlockHostUntilDone(Stream * stream)770 port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
771   return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
772 }
773 
CreateBlas()774 blas::BlasSupport* GpuExecutor::CreateBlas() {
775   PluginRegistry *registry = PluginRegistry::Instance();
776   port::StatusOr<PluginRegistry::BlasFactory> status =
777       registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
778                                                         plugin_config_.blas());
779   if (!status.ok()) {
780     LOG(ERROR) << "Unable to retrieve BLAS factory: "
781                << status.status().error_message();
782     return nullptr;
783   }
784 
785   return status.ValueOrDie()(this);
786 }
787 
CreateDnn()788 dnn::DnnSupport* GpuExecutor::CreateDnn() {
789   PluginRegistry *registry = PluginRegistry::Instance();
790   port::StatusOr<PluginRegistry::DnnFactory> status =
791       registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
792                                                        plugin_config_.dnn());
793   if (!status.ok()) {
794     LOG(ERROR) << "Unable to retrieve DNN factory: "
795                << status.status().error_message();
796     return nullptr;
797   }
798 
799   return status.ValueOrDie()(this);
800 }
801 
CreateFft()802 fft::FftSupport* GpuExecutor::CreateFft() {
803   PluginRegistry *registry = PluginRegistry::Instance();
804   port::StatusOr<PluginRegistry::FftFactory> status =
805       registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
806                                                        plugin_config_.fft());
807   if (!status.ok()) {
808     LOG(ERROR) << "Unable to retrieve FFT factory: "
809                << status.status().error_message();
810     return nullptr;
811   }
812 
813   return status.ValueOrDie()(this);
814 }
815 
CreateRng()816 rng::RngSupport* GpuExecutor::CreateRng() {
817   PluginRegistry *registry = PluginRegistry::Instance();
818   port::StatusOr<PluginRegistry::RngFactory> status =
819       registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
820                                                        plugin_config_.rng());
821   if (!status.ok()) {
822     LOG(ERROR) << "Unable to retrieve RNG factory: "
823                << status.status().error_message();
824     return nullptr;
825   }
826 
827   return status.ValueOrDie()(this);
828 }
829 
830 // TODO(rspringer): Remove in b/18544742.
SupportsDnn() const831 bool GpuExecutor::SupportsDnn() const { return true; }
832 
CanEnablePeerAccessTo(StreamExecutorInterface * other)833 bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
834   GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
835   return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
836 }
837 
EnablePeerAccessTo(StreamExecutorInterface * other)838 port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
839   GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
840   return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
841 }
842 
GetDeviceSharedMemoryConfig()843 SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
844   port::StatusOr<CUsharedconfig> cuda_config =
845       GpuDriver::ContextGetSharedMemConfig(context_);
846   if (!cuda_config.ok()) {
847     // Don't log; the failed call will log necessary output.
848     return SharedMemoryConfig::kDefault;
849   }
850 
851   switch (cuda_config.ValueOrDie()) {
852     case CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE:
853       return SharedMemoryConfig::kDefault;
854     case CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:
855       return SharedMemoryConfig::kFourByte;
856     case CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE:
857       return SharedMemoryConfig::kEightByte;
858     default:
859       LOG(FATAL) << "Invalid shared memory configuration returned: "
860                  << cuda_config.ValueOrDie();
861   }
862 }
863 
SetDeviceSharedMemoryConfig(SharedMemoryConfig config)864 port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
865     SharedMemoryConfig config) {
866   CUsharedconfig cuda_config;
867   switch (config) {
868     case SharedMemoryConfig::kDefault:
869       cuda_config = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
870       break;
871     case SharedMemoryConfig::kFourByte:
872       cuda_config = CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE;
873       break;
874     case SharedMemoryConfig::kEightByte:
875       cuda_config = CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE;
876       break;
877     default:
878       LOG(FATAL) << "Invalid shared memory configuration specified: "
879                  << static_cast<int>(config);
880   }
881   return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
882 }
883 
DeviceMemoryUsage(int64 * free,int64 * total) const884 bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
885   return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
886 }
887 
GetSymbol(const string & symbol_name,ModuleHandle module_handle,void ** mem,size_t * bytes)888 bool GpuExecutor::GetSymbol(const string& symbol_name,
889                             ModuleHandle module_handle, void** mem,
890                             size_t* bytes) {
891   auto lookup_in_module = [&](CUmodule module) {
892     CHECK(module != nullptr);
893     return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
894                                       reinterpret_cast<CUdeviceptr*>(mem),
895                                       bytes);
896   };
897 
898   {  // give limited scope to mutex_lock
899     mutex_lock lock{in_memory_modules_mu_};
900     if (static_cast<bool>(module_handle)) {
901       auto it = gpu_binary_to_module_.find(module_handle.id());
902       CHECK(it != gpu_binary_to_module_.end());
903       return lookup_in_module(it->second.first);
904     }
905 
906     for (auto &it : gpu_binary_to_module_) {
907       if (lookup_in_module(it.second.first)) {
908         return true;
909       }
910     }
911   }
912 
913   LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
914   return false;
915 }
916 
FillBlockDimLimit(BlockDim * block_dim_limit) const917 bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
918   // The BlockDim name is a mismatch against these GRID_DIM_* queries because
919   // we use BlockDims to express the dimensions of blocks within a grid
920   // (as opposed to ThreadDim which expresses the dimensions of threads
921   // within a block).
922   int x, y, z;
923   if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
924     return false;
925   }
926 
927   block_dim_limit->x = x;
928   block_dim_limit->y = y;
929   block_dim_limit->z = z;
930   return true;
931 }
932 
SupportsBlas() const933 bool GpuExecutor::SupportsBlas() const { return true; }
934 
SupportsFft() const935 bool GpuExecutor::SupportsFft() const { return true; }
936 
SupportsRng() const937 bool GpuExecutor::SupportsRng() const { return true; }
938 
939 std::unique_ptr<internal::EventInterface>
CreateEventImplementation()940 GpuExecutor::CreateEventImplementation() {
941   return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
942 }
943 
944 std::unique_ptr<internal::KernelInterface>
CreateKernelImplementation()945 GpuExecutor::CreateKernelImplementation() {
946   return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
947 }
948 
949 std::unique_ptr<internal::StreamInterface>
GetStreamImplementation()950 GpuExecutor::GetStreamImplementation() {
951   return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
952 }
953 
954 std::unique_ptr<internal::TimerInterface>
GetTimerImplementation()955 GpuExecutor::GetTimerImplementation() {
956   return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
957 }
958 
GpuContextHack()959 void* GpuExecutor::GpuContextHack() { return context_; }
960 
gpu_context()961 GpuContext* GpuExecutor::gpu_context() { return context_; }
962 
963 // Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
964 // of SysFS. Returns -1 if it cannot.
965 //
966 // For anything more complicated/prod-focused than this, you'll likely want to
967 // turn to gsys' topology modeling.
TryToReadNumaNode(const string & pci_bus_id,int device_ordinal)968 static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
969 #if defined(__APPLE__)
970   LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
971   return 0;
972 #elif defined(PLATFORM_WINDOWS)
973   // Windows support for NUMA is not currently implemented. Return node 0.
974   return 0;
975 #elif defined(__aarch64__)
976   LOG(INFO) << "ARM64 does not support NUMA - returning NUMA node zero";
977   return 0;
978 #else
979   VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
980   static const int kUnknownNumaNode = -1;
981 
982   if (pci_bus_id.empty()) {
983     LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
984     return kUnknownNumaNode;
985   }
986 
987   string filename =
988       port::Printf("/sys/bus/pci/devices/%s/numa_node", pci_bus_id.c_str());
989 
990   // We have to use fopen/fread here so that the device properties can be
991   // populated before InitGoogle procedure has been completed (at which point we
992   // could use the file::* utilities).
993   FILE *file = fopen(filename.c_str(), "r");
994   if (file == nullptr) {
995     LOG(ERROR) << "could not open file to read NUMA node: " << filename
996                << "\nYour kernel may have been built without NUMA support.";
997     return kUnknownNumaNode;
998   }
999 
1000   string content;
1001   char buf[32];
1002   size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
1003   buf[did_read] = '\0';
1004   content = buf;
1005 
1006   int32 value;
1007   if (port::safe_strto32(content, &value)) {
1008     if (value < 0) {  // See http://b/18228951 for details on this path.
1009       LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
1010                 << value << "), but there must be at least one NUMA node"
1011                             ", so returning NUMA node zero";
1012       fclose(file);
1013       return 0;
1014     }
1015     fclose(file);
1016     return value;
1017   }
1018 
1019   LOG(WARNING)
1020       << "could not convert SysFS file contents to integral NUMA node value: "
1021       << content;
1022 
1023   fclose(file);
1024   return kUnknownNumaNode;
1025 #endif
1026 }
1027 
PopulateDeviceDescription() const1028 DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
1029   internal::DeviceDescriptionBuilder builder;
1030 
1031   {
1032     int driver_version = 0;
1033     (void)GpuDriver::GetDriverVersion(&driver_version);
1034     string augmented_driver_version = port::Printf(
1035         "%d (%s)", driver_version,
1036         cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
1037             .c_str());
1038     builder.set_driver_version(augmented_driver_version);
1039   }
1040 
1041   {
1042     string pci_bus_id = GpuDriver::GetPCIBusID(device_);
1043 
1044     // Lower the hex characters to match sysfs.
1045     pci_bus_id = port::Lowercase(pci_bus_id);
1046     builder.set_pci_bus_id(pci_bus_id);
1047 
1048     // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
1049     int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
1050     builder.set_numa_node(numa_node);
1051   }
1052 
1053   {
1054     builder.set_threads_per_block_limit(
1055         GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1056                                       device_)
1057             .ValueOrDie());
1058 
1059     ThreadDim thread_dim_limit;
1060     thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
1061                              CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device_)
1062                              .ValueOrDie();
1063     thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
1064                              CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device_)
1065                              .ValueOrDie();
1066     thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
1067                              CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device_)
1068                              .ValueOrDie();
1069     builder.set_thread_dim_limit(thread_dim_limit);
1070 
1071     int clock_rate =
1072         GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
1073             .ValueOrDie();
1074     builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
1075   }
1076 
1077   {
1078     bool ecc_enabled = false;
1079     (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
1080     builder.set_ecc_enabled(ecc_enabled);
1081   }
1082 
1083   {
1084     uint64 device_memory_size = -1;
1085     (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
1086     builder.set_device_memory_size(device_memory_size);
1087   }
1088 
1089   port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
1090       CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
1091   port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
1092       CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
1093   if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
1094     // Times 2 because HBM is DDR memory; it gets two data bits per each data
1095     // lane.
1096     builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
1097                                  1000 *
1098                                  int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
1099   }
1100 
1101   {
1102     BlockDim block_dim_limit;
1103     FillBlockDimLimit(&block_dim_limit);
1104     builder.set_block_dim_limit(block_dim_limit);
1105   }
1106 
1107   {
1108     string device_name;
1109     (void)GpuDriver::GetDeviceName(device_, &device_name);
1110     builder.set_name(device_name);
1111   }
1112 
1113   builder.set_platform_version(
1114       absl::StrCat("Compute Capability ", cc_major_, ".", cc_minor_));
1115 
1116   // TODO(leary) should be a way to query this from the driver, but this is
1117   // unlikely to change for us any time soon.
1118   builder.set_device_address_bits(64);
1119 
1120   builder.set_device_vendor("NVIDIA Corporation");
1121   builder.set_cuda_compute_capability(cc_major_, cc_minor_);
1122   builder.set_shared_memory_per_core(
1123       GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
1124   builder.set_shared_memory_per_block(
1125       GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
1126   builder.set_core_count(
1127       GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
1128   builder.set_threads_per_core_limit(
1129       GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
1130   builder.set_registers_per_block_limit(
1131       GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
1132   builder.set_threads_per_warp(
1133       GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
1134   builder.set_registers_per_core_limit(
1135       GpuDriver::GetDeviceAttribute(
1136           CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
1137           .ValueOrDie());
1138 
1139   // We are loading a dummy ptx kernel to set the device description's
1140   // blocks_per_core_limit by calling the CUDA occupancy calculator.  This
1141   // value is currently required XLA GPU's CalculateLaunchDimensions()
1142   const char* blank_ptx = R"(
1143 .version 6.0
1144 .target sm_30
1145 .address_size 64
1146 
1147         // .globl       testkernel
1148 .visible .entry testkernel()
1149 {
1150         ret;
1151 })";
1152   const char* kernel_name = "testkernel";
1153 
1154   CUmodule blank_module;
1155   CUfunction blank_function;
1156   int bpc = -1;
1157   bool ptx_success =
1158       cuda::CUDADriver::LoadPtx(context_, blank_ptx, &blank_module);
1159   if (ptx_success) {
1160     ptx_success = cuda::CUDADriver::GetModuleFunction(
1161         context_, blank_module, kernel_name, &blank_function);
1162     if (ptx_success) {
1163       CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
1164           &bpc, blank_function, 1, 1);
1165       if (result != CUDA_SUCCESS) {
1166         bpc = -1;
1167         ptx_success = false;
1168       }
1169     }
1170     cuda::CUDADriver::UnloadModule(context_, blank_module);
1171   }
1172   if (!ptx_success) {
1173     LOG(ERROR) << "Failed to calculate max blocks per SM using dummy kernel.";
1174   }
1175   builder.set_blocks_per_core_limit(bpc);
1176 
1177   auto built = builder.Build();
1178   return built.release();
1179 }
1180 
1181 }  // namespace gpu
1182 
initialize_cuda_gpu_executor()1183 void initialize_cuda_gpu_executor() {
1184   *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig& config) {
1185     return new gpu::GpuExecutor{config};
1186   };
1187 }
1188 
1189 }  // namespace stream_executor
1190 
// Registers a module initializer named `cuda_gpu_executor` whose body calls
// stream_executor::initialize_cuda_gpu_executor().
REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {
  stream_executor::initialize_cuda_gpu_executor();
});
1194