1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
17 
18 #include <stdint.h>
19 #include <stdlib.h>
20 #include <map>
21 #include <set>
22 #include <utility>
23 
24 #include "absl/base/casts.h"
25 #include "absl/container/inlined_vector.h"
26 #include "absl/debugging/leak_check.h"
27 #include "absl/strings/str_cat.h"
28 #include "cuda/include/cuda_runtime_api.h"
29 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
30 #include "tensorflow/stream_executor/lib/env.h"
31 #include "tensorflow/stream_executor/lib/error.h"
32 #include "tensorflow/stream_executor/lib/human_readable.h"
33 #include "tensorflow/stream_executor/lib/notification.h"
34 #include "tensorflow/stream_executor/lib/ptr_util.h"
35 #include "tensorflow/stream_executor/lib/stacktrace.h"
36 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
37 #include "tensorflow/stream_executor/lib/stringprintf.h"
38 #include "tensorflow/stream_executor/lib/threadpool.h"
39 #include "tensorflow/stream_executor/platform/logging.h"
40 #include "tensorflow/stream_executor/platform/mutex.h"
41 #include "tensorflow/stream_executor/platform/port.h"
42 
// When true, InternalInit() skips the cuInit call and reports
// CUDA_ERROR_NO_DEVICE, so GpuDriver::Init() returns a failure status.
bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
// When true, ScopedActivateContext calls SynchronizeOrDie() (cuCtxSynchronize,
// fatal on error) at the start of both its constructor and its destructor.
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
// NOTE(review): not read anywhere in the visible part of this file;
// presumably limits the driver to device 0 -- confirm against its users.
bool FLAGS_gpuexec_cuda_device_0_only = false;

// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one.
constexpr bool kVerifyGpuContext = false;
50 
51 namespace stream_executor {
52 namespace gpu {
53 namespace {
54 
55 // Manages the singleton map of contexts that we've created, mapping
56 // from the CUcontext to the GpuContext* that we pass around internally.
57 // This also manages assignment of unique ids to GpuContexts, to allow
58 // for fast comparison of a context against the current context.
59 //
60 // CUDA-runtime-created contexts are avoided, if triple angle
61 // brace launches are required, by using the scoped activations in
62 // gpu/gpu_activation.h.
63 class CreatedContexts {
64  public:
65   // Returns whether context is a member of the live set.
Has(CUcontext context)66   static bool Has(CUcontext context) {
67     tf_shared_lock lock(mu_);
68     return Live()->find(context) != Live()->end();
69   }
70 
71   // Adds context to the live set, or returns it if it's already present.
Add(CUcontext context)72   static GpuContext* Add(CUcontext context) {
73     CHECK(context != nullptr);
74     mutex_lock lock(mu_);
75     auto insert_result = Live()->insert(std::make_pair(context, nullptr));
76     auto it = insert_result.first;
77     if (insert_result.second) {
78       // context was not present in the map.  Add it.
79       it->second = MakeUnique<GpuContext>(context, next_id_++);
80     }
81     return it->second.get();
82   }
83 
84   // Removes context from the live set.
Remove(CUcontext context)85   static void Remove(CUcontext context) {
86     CHECK(context != nullptr);
87     mutex_lock lock(mu_);
88     auto it = Live()->find(context);
89     CHECK(it != Live()->end()) << context;
90     Live()->erase(it);
91   }
92 
93  private:
94   // Returns the live map singleton.
Live()95   static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
96     static auto singleton =
97         new std::map<CUcontext, std::unique_ptr<GpuContext>>;
98     return singleton;
99   }
100 
101   // Lock that guards access-to/mutation-of the live set.
102   static mutex mu_;
103   static int64 next_id_;
104 };
105 
106 /* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED};
107 /* static */ int64 CreatedContexts::next_id_ = 1;  // 0 means "no context"
108 
109 // Formats CUresult to output prettified values into a log stream.
ToString(CUresult result)110 string ToString(CUresult result) {
111   const char* error_name;
112   if (cuGetErrorName(result, &error_name)) {
113     return absl::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
114   }
115   const char* error_string;
116   if (cuGetErrorString(result, &error_string)) {
117     return error_name;
118   }
119   return absl::StrCat(error_name, ": ", error_string);
120 }
121 
122 // Returns the current context and checks that it is in the set of CUDA contexts
123 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
124 // context behind our backs).
CurrentContext()125 CUcontext CurrentContext() {
126   CUcontext current = cuda::CurrentContextOrDie();
127   if (current != nullptr && !CreatedContexts::Has(current)) {
128     LOG(FATAL) << "current context was not created by the StreamExecutor "
129                   "cuda_driver API: "
130                << current
131                << "; a CUDA runtime call "
132                   "was likely performed without using a StreamExecutor context";
133   }
134   return current;
135 }
136 
137 // CUDA driver routines may require a large amount of stack (particularly
138 // cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
139 // stack-limited threads (such as those spawned by a default-argument
140 // thread::ThreadPool on some platforms), we run certain routines in this pool
141 // and wait for completion.
142 static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
InitializeDriverExecutor()143 static port::ThreadPool* InitializeDriverExecutor() {
144   return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
145                               "cuda_driver", 1);
146 }
147 
GetDriverExecutor()148 port::ThreadPool* GetDriverExecutor() {
149   mutex_lock lock(driver_executor_threadpool_mu);
150   static port::ThreadPool* thread_pool = InitializeDriverExecutor();
151   return thread_pool;
152 }
153 
154 }  // namespace
155 
MemorySpaceString(MemorySpace memory_space)156 string MemorySpaceString(MemorySpace memory_space) {
157   switch (memory_space) {
158     case MemorySpace::kHost:
159       return "host";
160     case MemorySpace::kDevice:
161       return "device";
162     default:
163       LOG(FATAL) << "impossible memory space";
164   }
165 }
166 
167 namespace {
168 
169 template <typename PtrT>
PointerIsValid(const PtrT ptr)170 bool PointerIsValid(const PtrT ptr) {
171   // Checks that the pointer is to a location on the device it purports to be.
172   // PtrT is one of CUdeviceptr or void*.  If it's a CUdeviceptr, then
173   // cudaPointerGetAttributes should not fail, and return a memoryType of
174   // cudaMemoryTypeDevice.
175 
176   bool is_host_ptr = !std::is_same<PtrT, CUdeviceptr>::value;
177   cudaPointerAttributes attributes;
178   cudaError_t err =
179       cudaPointerGetAttributes(&attributes, reinterpret_cast<const void*>(ptr));
180   // If we failed, reset cuda error status to avoid poisoning cuda streams.
181   if (err != cudaSuccess) cudaGetLastError();
182   bool points_to_host_memory = (err == cudaErrorInvalidValue ||
183                                 attributes.memoryType != cudaMemoryTypeDevice);
184   return (is_host_ptr == points_to_host_memory);
185 }
186 
187 // Call cuCtxtSynchronize and crash if it doesn't succeed.
SynchronizeOrDie()188 void SynchronizeOrDie() {
189   auto res = cuCtxSynchronize();
190   if (res != CUDA_SUCCESS) {
191     LOG(FATAL) << "Synchronize found " << ToString(res)
192                << " :: " << port::CurrentStackTrace();
193   }
194 }
195 
// Per-thread bookkeeping used by ScopedActivateContext to avoid redundant
// cuCtxSetCurrent calls and to restore the previous context on scope exit.
struct ThreadLocalData {
  // Id of the GpuContext most recently made current on this thread by
  // ScopedActivateContext; compared against GpuContext::id() on activation.
  int64 id;
  GpuContext* context;  // Only valid if id == a known good context.
  // Nesting level of ScopedActivateContext objects on this thread: incremented
  // by the constructor, decremented by the destructor.
  int depth;
};

// The per-thread instance, accessed through tls_data.get().
SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
203 
204 }  // namespace
205 
ScopedActivateContext(GpuContext * cuda_context)206 ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
207   if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();
208 
209   auto* tls = &tls_data.get();
210   tls->depth++;
211   if (tls->id == cuda_context->id()) {
212     if (kVerifyGpuContext) {
213       CHECK_EQ(CurrentContext(), cuda_context->context());
214     }
215     DCHECK_EQ(CurrentContext(), cuda_context->context());
216     return;
217   }
218 
219   VLOG(3) << "ScopedActivateContext switching context from " << tls->id
220           << " to " << cuda_context->id();
221 
222   to_restore_ = (tls->depth == 1 ? nullptr : tls->context);
223 
224   // Set the context and update thread local.
225   CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context()));
226   tls->id = cuda_context->id();
227   tls->context = cuda_context;
228 }
229 
~ScopedActivateContext()230 ScopedActivateContext::~ScopedActivateContext() {
231   if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();
232 
233   auto* tls = &tls_data.get();
234 
235   if (kVerifyGpuContext) {
236     // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
237     // possible this could fail in the CurrentContext() call.
238     CHECK_EQ(CurrentContext(),
239              tls->context == nullptr ? nullptr : tls->context->context());
240   }
241 
242   tls->depth--;
243   DCHECK_GE(tls->depth, 0);
244   if (to_restore_ == nullptr) {
245     // Leave context, tls->id, and tls->context set.
246     return;
247   }
248 
249   // Set context and update thread local.
250   CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context()));
251   tls->id = to_restore_->id();
252   tls->context = to_restore_;
253 }
254 
255 namespace {
256 
257 // Returns a stringified device number associated with pointer, primarily for
258 // logging purposes. Returns "?" if the device could not be successfully
259 // queried.
CUDAPointerToDeviceString(CUdeviceptr pointer)260 string CUDAPointerToDeviceString(CUdeviceptr pointer) {
261   auto value = GpuDriver::GetPointerDevice(pointer);
262   if (value.ok()) {
263     return absl::StrCat(value.ValueOrDie());
264   }
265   LOG(ERROR) << "could not query device: " << value.status();
266   return "?";
267 }
268 
269 // Returns a stringified memory space associated with pointer, primarily for
270 // logging purposes. Returns "?" if the memory space could not be successfully
271 // queried.
CUDAPointerToMemorySpaceString(CUdeviceptr pointer)272 string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
273   auto value = GpuDriver::GetPointerMemorySpace(pointer);
274   if (value.ok()) {
275     return MemorySpaceString(value.ValueOrDie());
276   }
277   LOG(ERROR) << "could not query device: " << value.status();
278   return "?";
279 }
280 
281 // Returns a stringified representation of whether or not peer access is
282 // permitted between the "from" and "to" pointers' associated contexts,
283 // primarily for logging purposes. Returns "error" if an error is encountered
284 // in the process of querying.
CUDAPointersToCanAccessString(CUdeviceptr from,CUdeviceptr to)285 string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
286   auto from_context = GpuDriver::GetPointerContext(from);
287   if (!from_context.ok()) {
288     LOG(ERROR) << "could not retrieve source pointer's context: "
289                << from_context.status();
290     return "error";
291   }
292   auto to_context = GpuDriver::GetPointerContext(to);
293   if (!to_context.ok()) {
294     LOG(ERROR) << "could not retrieve destination pointer's context: "
295                << to_context.status();
296     return "error";
297   }
298   return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
299                                         to_context.ValueOrDie())
300              ? "true"
301              : "false";
302 }
303 
304 // Actually performs the work of CUDA initialization. Wrapped up in one-time
305 // execution guard.
InternalInit()306 static port::Status InternalInit() {
307   CUresult res = CUDA_ERROR_NO_DEVICE;
308   if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
309     LOG(ERROR) << "injecting CUDA init error; initialization will fail";
310   } else {
311     res = cuInit(0 /* = flags */);
312   }
313 
314   if (res == CUDA_SUCCESS) {
315     return port::Status::OK();
316   }
317 
318   LOG(ERROR) << "failed call to cuInit: " << ToString(res);
319   Diagnostician::LogDiagnosticInformation();
320   return port::Status(port::error::ABORTED,
321                       absl::StrCat("failed call to cuInit: ", ToString(res)));
322 }
323 
324 }  // namespace
325 
Init()326 /* static */ port::Status GpuDriver::Init() {
327   // Cached return value from calling InternalInit(), as cuInit need only be
328   // called once, but GpuDriver::Init may be called many times.
329   static port::Status init_retval;
330   static bool set = false;
331   static mutex* init_mu = new mutex;
332 
333   mutex_lock lock(*init_mu);
334   if (!set) {
335     init_retval = InternalInit();
336     set = true;
337   }
338 
339   return init_retval;
340 }
341 
GetDevice(int device_ordinal,CUdevice * device)342 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
343                                                CUdevice* device) {
344   CUresult res = cuDeviceGet(device, device_ordinal);
345   if (res == CUDA_SUCCESS) {
346     return port::Status::OK();
347   }
348 
349   return port::Status(
350       port::error::INTERNAL,
351       absl::StrCat("failed call to cuDeviceGet: ", ToString(res)));
352 }
353 
GetDeviceName(CUdevice device,string * device_name)354 /* static */ bool GpuDriver::GetDeviceName(CUdevice device,
355                                            string* device_name) {
356   static const size_t kCharLimit = 64;
357   absl::InlinedVector<char, 4> chars(kCharLimit);
358   CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
359   if (res != CUDA_SUCCESS) {
360     LOG(ERROR) << "failed to get device name for " << device << ": "
361                << ToString(res);
362     return false;
363   }
364   chars[kCharLimit - 1] = '\0';
365   *device_name = chars.begin();
366   return true;
367 }
368 
DeviceOptionsToContextFlags(const DeviceOptions & device_options,int * flags)369 bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
370                                  int* flags) {
371   static_assert(DeviceOptions::kMask == 0xf,
372                 "needs update for new device options");
373 
374   if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
375     *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
376   }
377 
378   // If no flags are set the default is CU_CTX_SCHED_AUTO, which
379   // in Google environments is very likely to mean SPIN.
380   if (device_options.flags() & DeviceOptions::kScheduleSpin) {
381     *flags |= CU_CTX_SCHED_SPIN;
382   }
383   if (device_options.flags() & DeviceOptions::kScheduleYield) {
384     *flags |= CU_CTX_SCHED_YIELD;
385   }
386   if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
387     *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
388   }
389 
390   return true;
391 }
392 
CreateContext(int device_ordinal,CUdevice device,const DeviceOptions & device_options,GpuContext ** context)393 /* static */ port::Status GpuDriver::CreateContext(
394     int device_ordinal, CUdevice device, const DeviceOptions& device_options,
395     GpuContext** context) {
396   *context = nullptr;
397 
398   int flags = 0;
399   if (!DeviceOptionsToContextFlags(device_options, &flags)) {
400     LOG(WARNING) << "could not convert all device options into context flags";
401   }
402 
403   CUresult res;
404   CUcontext former_context;
405   CUcontext new_context;
406 
407   unsigned int former_primary_context_flags;
408   int former_primary_context_is_active;
409   CHECK_EQ(CUDA_SUCCESS,
410            cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
411                                       &former_primary_context_is_active));
412   if (former_primary_context_flags != flags) {
413     if (former_primary_context_is_active) {
414       LOG(ERROR)
415           << "The primary context is active and has a different flag set ("
416           << former_primary_context_flags << ") than the desired flag set ("
417           << flags << ").";
418     } else {
419       CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
420     }
421   }
422 
423   former_context = cuda::CurrentContextOrDie();
424   res = cuDevicePrimaryCtxRetain(&new_context, device);
425   if (former_context != nullptr) {
426     CUdevice former_device;
427     if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
428       if (former_device == device) {
429         if (former_context == new_context) {
430           VLOG(2) << "The primary context " << former_context << " for device "
431                   << device
432                   << " exists before initializing the StreamExecutor.";
433         } else {
434           LOG(WARNING) << "A non-primary context " << former_context
435                        << " for device " << device
436                        << " exists before initializing the StreamExecutor. The "
437                        << "primary context is now " << new_context << ". We "
438                        << "haven't verified StreamExecutor works with that.";
439         }
440       }
441     } else {
442       LOG(ERROR) << "Failed to get the device of the current context "
443                  << former_context;
444     }
445   }
446   CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));
447 
448   if (res == CUDA_SUCCESS) {
449     *context = CreatedContexts::Add(new_context);
450     CHECK(*context != nullptr)
451         << "success in this call must entail non-null result";
452     VLOG(2) << "created or reused context " << new_context
453             << " for this thread";
454     return port::Status::OK();
455   }
456 
457   string message = "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
458   if (res == CUDA_ERROR_OUT_OF_MEMORY) {
459     uint64 total_memory;
460     if (GetDeviceTotalMemory(device, &total_memory)) {
461       absl::StrAppend(&message, "; total memory reported: ", total_memory);
462     } else {
463       absl::StrAppend(&message, "; could not query total memory");
464     }
465   }
466 
467   return port::Status(port::error::INTERNAL, message);
468 }
469 
DestroyContext(GpuContext * context)470 /* static */ void GpuDriver::DestroyContext(GpuContext* context) {
471   if (context == nullptr) {
472     return;
473   }
474   CUcontext former_context = CurrentContext();
475   CUresult res = cuCtxSetCurrent(context->context());
476   CUdevice device;
477   cuCtxGetDevice(&device);
478   cuCtxSetCurrent(former_context);
479 
480   res = cuDevicePrimaryCtxRelease(device);
481 
482   if (res != CUDA_SUCCESS) {
483     LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
484   }
485 
486   CreatedContexts::Remove(context->context());
487 }
488 
FuncGetAttribute(CUfunction_attribute attribute,CUfunction func,int * attribute_value)489 /* static */ bool GpuDriver::FuncGetAttribute(CUfunction_attribute attribute,
490                                               CUfunction func,
491                                               int* attribute_value) {
492   CUresult res = cuFuncGetAttribute(attribute_value, attribute, func);
493   if (res != CUDA_SUCCESS) {
494     LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
495                << ", attribute: " << attribute;
496     return false;
497   }
498   return true;
499 }
500 
FuncSetCacheConfig(CUfunction function,CUfunc_cache cache_config)501 /* static */ bool GpuDriver::FuncSetCacheConfig(CUfunction function,
502                                                 CUfunc_cache cache_config) {
503   CUresult res = cuFuncSetCacheConfig(function, cache_config);
504   if (res != CUDA_SUCCESS) {
505     LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
506                << ", config: " << cache_config << ", result: " << ToString(res);
507     return false;
508   }
509 
510   return true;
511 }
512 
513 /* static */ port::StatusOr<CUsharedconfig>
ContextGetSharedMemConfig(GpuContext * context)514 GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
515   CUsharedconfig shared_mem_config;
516   ScopedActivateContext activation(context);
517   CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config);
518   if (result != CUDA_SUCCESS) {
519     CUdevice device;
520     cuCtxGetDevice(&device);
521     LOG(ERROR) << "failed to get CUDA device shared memory config. "
522                << "Context device ID: " << device
523                << ", result: " << ToString(result);
524     return port::Status(
525         port::error::INTERNAL,
526         absl::StrCat("failed to get shared memory config: ", ToString(result)));
527   }
528   return shared_mem_config;
529 }
530 
ContextSetSharedMemConfig(GpuContext * context,CUsharedconfig shared_mem_config)531 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
532     GpuContext* context, CUsharedconfig shared_mem_config) {
533   ScopedActivateContext activation(context);
534   CUresult result = cuCtxSetSharedMemConfig(shared_mem_config);
535   if (result != CUDA_SUCCESS) {
536     CUdevice device;
537     cuCtxGetDevice(&device);
538     LOG(ERROR) << "failed to set CUDA device shared memory config. "
539                << "Context device ID: " << device
540                << ", config: " << shared_mem_config
541                << ", result: " << ToString(result);
542     return port::Status(
543         port::error::INTERNAL,
544         absl::StrCat("failed to set shared memory config: ", ToString(result)));
545   }
546   return port::Status::OK();
547 }
548 
LaunchKernel(GpuContext * context,CUfunction function,unsigned int grid_dim_x,unsigned int grid_dim_y,unsigned int grid_dim_z,unsigned int block_dim_x,unsigned int block_dim_y,unsigned int block_dim_z,unsigned int shared_mem_bytes,CUstream stream,void ** kernel_params,void ** extra)549 /* static */ bool GpuDriver::LaunchKernel(
550     GpuContext* context, CUfunction function, unsigned int grid_dim_x,
551     unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
552     unsigned int block_dim_y, unsigned int block_dim_z,
553     unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
554     void** extra) {
555   ScopedActivateContext activation(context);
556   VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
557           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
558           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
559           << " bdz: " << block_dim_z;
560   CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z,
561                                 block_dim_x, block_dim_y, block_dim_z,
562                                 shared_mem_bytes, stream, kernel_params, extra);
563   if (res != CUDA_SUCCESS) {
564     LOG(ERROR) << "failed to launch CUDA kernel: " << function
565                << "; result: " << ToString(res);
566     return false;
567   }
568   VLOG(2) << "successfully launched kernel";
569   return true;
570 }
571 
LoadCubin(GpuContext * context,const char * cubin_bytes,CUmodule * module)572 /* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
573                                                const char* cubin_bytes,
574                                                CUmodule* module) {
575   ScopedActivateContext activation(context);
576   CUresult result = cuModuleLoadFatBinary(module, cubin_bytes);
577   if (result != CUDA_SUCCESS) {
578     return port::Status(port::error::INTERNAL,
579                         "failed to load in-memory CUBIN: " + ToString(result));
580   }
581 
582   return port::Status::OK();
583 }
584 
LoadPtx(GpuContext * context,const char * ptx_contents,CUmodule * module)585 /* static */ bool GpuDriver::LoadPtx(GpuContext* context,
586                                      const char* ptx_contents,
587                                      CUmodule* module) {
588   port::Notification notification;
589   bool ret = true;
590   GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
591                                  &notification]() {
592     ScopedActivateContext activation(context);
593     void* ptx_data = const_cast<char*>(ptx_contents);
594     static const unsigned int kLogBufferBytesLimit = 1024;
595     unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
596     unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
597     absl::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
598     absl::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
599     bool log_verbose = true;
600     CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
601                               CU_JIT_ERROR_LOG_BUFFER,
602                               CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
603                               CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
604     // Note that the driver API wants the contents of this values to be stored
605     // in an array of void*s, so we coerce them accordingly.
606     void* option_values[] = {
607         absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
608         absl::bit_cast<void*>(error_log_buffer.data()),
609         absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
610         absl::bit_cast<void*>(info_log_buffer.data()),
611         absl::bit_cast<void*>(uintptr_t(log_verbose))};
612     CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
613 
614     CUresult res;
615     {
616       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
617       // module loading: see http://b/13248943
618       absl::LeakCheckDisabler disabler;
619       res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), options,
620                                option_values);
621     }
622 
623     // The PTX JIT mutates the values in the option values array to reflect the
624     // size of the logs it output; now that we've made the call, read the values
625     // back out.
626     error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
627     info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
628     CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
629     CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
630 
631     if (res != CUDA_SUCCESS) {
632       LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
633       // As a precaution for null termination of the API-provided value, ensure
634       // that at least the last byte is null.
635       error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
636                                               : 0] = '\0';
637       LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
638                  << " bytes): " << error_log_buffer.data();
639       ret = false;
640       notification.Notify();
641     }
642 
643     VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
644             << " bytes): " << info_log_buffer.data();
645     VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
646             << " bytes): " << error_log_buffer.data();
647     CHECK(module != nullptr);
648     notification.Notify();
649   });
650   notification.WaitForNotification();
651 
652   return ret;
653 }
654 
LoadHsaco(GpuContext * context,const char * hsaco_contents,CUmodule * module)655 /* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
656                                        const char* hsaco_contents,
657                                        CUmodule* module) {
658   LOG(ERROR) << "Feature not supported on CUDA platform (LoadHsaco)";
659   return false;
660 }
661 
SynchronousMemsetUint8(GpuContext * context,CUdeviceptr location,uint8 value,size_t size)662 /* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
663                                                     CUdeviceptr location,
664                                                     uint8 value, size_t size) {
665   ScopedActivateContext activation(context);
666   CUresult res = cuMemsetD8(location, value, size);
667   if (res != CUDA_SUCCESS) {
668     LOG(ERROR) << "failed to memset memory: " << ToString(res);
669     return false;
670   }
671   return true;
672 }
673 
SynchronousMemsetUint32(GpuContext * context,CUdeviceptr location,uint32 value,size_t uint32_count)674 /* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
675                                                      CUdeviceptr location,
676                                                      uint32 value,
677                                                      size_t uint32_count) {
678   ScopedActivateContext activation(context);
679   CUresult res = cuMemsetD32(location, value, uint32_count);
680   if (res != CUDA_SUCCESS) {
681     LOG(ERROR) << "failed to memset memory: " << ToString(res);
682     return false;
683   }
684   return true;
685 }
686 
AsynchronousMemsetUint8(GpuContext * context,CUdeviceptr location,uint8 value,size_t uint32_count,CUstream stream)687 /* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
688                                                      CUdeviceptr location,
689                                                      uint8 value,
690                                                      size_t uint32_count,
691                                                      CUstream stream) {
692   ScopedActivateContext activation(context);
693   CUresult res = cuMemsetD8Async(location, value, uint32_count, stream);
694   if (res != CUDA_SUCCESS) {
695     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
696     return false;
697   }
698   VLOG(2) << "successfully enqueued async memset operation";
699   return true;
700 }
701 
AsynchronousMemsetUint32(GpuContext * context,CUdeviceptr location,uint32 value,size_t uint32_count,CUstream stream)702 /* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
703                                                       CUdeviceptr location,
704                                                       uint32 value,
705                                                       size_t uint32_count,
706                                                       CUstream stream) {
707   ScopedActivateContext activation(context);
708   CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
709   if (res != CUDA_SUCCESS) {
710     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
711     return false;
712   }
713   VLOG(2) << "successfully enqueued async memset operation";
714   return true;
715 }
716 
AddStreamCallback(GpuContext * context,CUstream stream,StreamCallback callback,void * data)717 /* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
718                                                CUstream stream,
719                                                StreamCallback callback,
720                                                void* data) {
721   // Note: flags param is required to be zero according to CUDA 6.0.
722   CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
723   if (res != CUDA_SUCCESS) {
724     LOG(ERROR) << "unable to add host callback: " << ToString(res);
725     return false;
726   }
727   return true;
728 }
729 
GetModuleFunction(GpuContext * context,CUmodule module,const char * kernel_name,CUfunction * function)730 /* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
731                                                CUmodule module,
732                                                const char* kernel_name,
733                                                CUfunction* function) {
734   ScopedActivateContext activated{context};
735   CHECK(module != nullptr && kernel_name != nullptr);
736   CUresult res = cuModuleGetFunction(function, module, kernel_name);
737   if (res != CUDA_SUCCESS) {
738     LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
739                << "\" from module: " << ToString(res);
740     return false;
741   }
742 
743   return true;
744 }
745 
GetModuleSymbol(GpuContext * context,CUmodule module,const char * symbol_name,CUdeviceptr * dptr,size_t * bytes)746 /* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
747                                              CUmodule module,
748                                              const char* symbol_name,
749                                              CUdeviceptr* dptr, size_t* bytes) {
750   ScopedActivateContext activated{context};
751   CHECK(module != nullptr && symbol_name != nullptr &&
752         (dptr != nullptr || bytes != nullptr));
753   CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
754   if (res != CUDA_SUCCESS) {
755     // symbol may not be found in the current module, but it may reside in
756     // another module.
757     VLOG(2) << "failed to get symbol \"" << symbol_name
758             << "\" from module: " << ToString(res);
759     return false;
760   }
761 
762   return true;
763 }
764 
UnloadModule(GpuContext * context,CUmodule module)765 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
766                                           CUmodule module) {
767   ScopedActivateContext activated{context};
768   CUresult res = cuModuleUnload(module);
769   if (res != CUDA_SUCCESS) {
770     LOG(ERROR) << "failed to unload module " << module
771                << "; leaking: " << ToString(res);
772   }
773 }
774 
DeviceFromContext(GpuContext * context)775 /* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
776     GpuContext* context) {
777   ScopedActivateContext activated{context};
778   CUdevice device = -1;
779   CUresult result = cuCtxGetDevice(&device);
780   if (result == CUDA_SUCCESS) {
781     return device;
782   }
783 
784   return port::Status(
785       port::error::INTERNAL,
786       absl::StrCat("failed to get device for context: ", ToString(result)));
787 }
788 
CreateStream(GpuContext * context,CUstream * stream)789 /* static */ bool GpuDriver::CreateStream(GpuContext* context,
790                                           CUstream* stream) {
791   // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
792   // up synchronization with respect to memsets and any other things that have
793   // to occur on the default stream?
794   ScopedActivateContext activated{context};
795   CUresult res = cuStreamCreate(stream, 0);
796   if (res != CUDA_SUCCESS) {
797     LOG(ERROR) << "could not allocate CUDA stream for context "
798                << context->context() << ": " << ToString(res);
799     return false;
800   }
801 
802   VLOG(2) << "successfully created stream " << *stream << " for context "
803           << context->context() << " on thread";
804   return true;
805 }
806 
DestroyStream(GpuContext * context,CUstream * stream)807 /* static */ void GpuDriver::DestroyStream(GpuContext* context,
808                                            CUstream* stream) {
809   if (*stream == nullptr) {
810     return;
811   }
812 
813   ScopedActivateContext activated{context};
814   CUresult res = cuStreamDestroy(*stream);
815   if (res != CUDA_SUCCESS) {
816     LOG(ERROR) << "failed to destroy CUDA stream for context "
817                << context->context() << ": " << ToString(res);
818   } else {
819     VLOG(2) << "successfully destroyed stream " << *stream << " for context "
820             << context->context();
821     *stream = nullptr;
822   }
823 }
824 
DeviceAllocate(GpuContext * context,uint64 bytes)825 /* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
826                                              uint64 bytes) {
827   if (bytes == 0) {
828     return nullptr;
829   }
830 
831   ScopedActivateContext activated{context};
832   CUdeviceptr result = 0;
833   CUresult res = cuMemAlloc(&result, bytes);
834   if (res != CUDA_SUCCESS) {
835     LOG(ERROR) << "failed to allocate "
836                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
837                << " bytes) from device: " << ToString(res);
838     return nullptr;
839   }
840   void* ptr = reinterpret_cast<void*>(result);
841   VLOG(2) << "allocated " << ptr << " for context " << context->context()
842           << " of " << bytes << " bytes";
843   return ptr;
844 }
845 
DeviceDeallocate(GpuContext * context,void * location)846 /* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
847                                               void* location) {
848   ScopedActivateContext activation(context);
849   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
850   CUresult res = cuMemFree(pointer);
851   if (res != CUDA_SUCCESS) {
852     LOG(ERROR) << "failed to free device memory at " << location
853                << "; result: " << ToString(res);
854   } else {
855     VLOG(2) << "deallocated " << location << " for context "
856             << context->context();
857   }
858 }
859 
UnifiedMemoryAllocate(GpuContext * context,uint64 bytes)860 /* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
861                                                     uint64 bytes) {
862   ScopedActivateContext activation(context);
863   CUdeviceptr result = 0;
864   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
865   CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
866   if (res != CUDA_SUCCESS) {
867     LOG(ERROR) << "failed to alloc " << bytes
868                << " bytes unified memory; result: " << ToString(res);
869     return nullptr;
870   }
871   void* ptr = reinterpret_cast<void*>(result);
872   VLOG(2) << "allocated " << ptr << " for context " << context->context()
873           << " of " << bytes << " bytes in unified memory";
874   return ptr;
875 }
876 
UnifiedMemoryDeallocate(GpuContext * context,void * location)877 /* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
878                                                      void* location) {
879   ScopedActivateContext activation(context);
880   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
881   CUresult res = cuMemFree(pointer);
882   if (res != CUDA_SUCCESS) {
883     LOG(ERROR) << "failed to free unified memory at " << location
884                << "; result: " << ToString(res);
885   } else {
886     VLOG(2) << "deallocated unified memory at " << location << " for context "
887             << context->context();
888   }
889 }
890 
HostAllocate(GpuContext * context,uint64 bytes)891 /* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
892   ScopedActivateContext activation(context);
893   void* host_mem = nullptr;
894   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
895   CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
896   if (res != CUDA_SUCCESS) {
897     LOG(ERROR) << "failed to alloc " << bytes
898                << " bytes on host: " << ToString(res);
899   }
900   return host_mem;
901 }
902 
HostDeallocate(GpuContext * context,void * location)903 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
904                                             void* location) {
905   ScopedActivateContext activation(context);
906   CUresult res = cuMemFreeHost(location);
907   if (res != CUDA_SUCCESS) {
908     LOG(ERROR) << "error deallocating host memory at " << location << ": "
909                << ToString(res);
910   }
911 }
912 
HostRegister(GpuContext * context,void * location,uint64 bytes)913 /* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
914                                           uint64 bytes) {
915   ScopedActivateContext activation(context);
916   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
917   CUresult res =
918       cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
919   if (res != CUDA_SUCCESS) {
920     LOG(ERROR) << "error registering host memory at " << location << ": "
921                << ToString(res);
922     return false;
923   }
924   return true;
925 }
926 
HostUnregister(GpuContext * context,void * location)927 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
928                                             void* location) {
929   ScopedActivateContext activation(context);
930   CUresult res = cuMemHostUnregister(location);
931   if (res != CUDA_SUCCESS) {
932     LOG(ERROR) << "error unregistering host memory at " << location << ": "
933                << ToString(res);
934     return false;
935   }
936   return true;
937 }
938 
DestroyEvent(GpuContext * context,CUevent * event)939 /* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
940                                                   CUevent* event) {
941   if (*event == nullptr) {
942     return port::Status(port::error::INVALID_ARGUMENT,
943                         "input event cannot be null");
944   }
945 
946   ScopedActivateContext activated{context};
947   CUresult res = cuEventDestroy(*event);
948   *event = nullptr;
949 
950   switch (res) {
951     case CUDA_SUCCESS:
952       return port::Status::OK();
953     case CUDA_ERROR_DEINITIALIZED:
954     case CUDA_ERROR_NOT_INITIALIZED:
955       return port::Status(
956           port::error::FAILED_PRECONDITION,
957           port::Printf("error destroying CUDA event in context %p: %s", context,
958                        ToString(res).c_str()));
959     default:
960       return port::Status(
961           port::error::INTERNAL,
962           port::Printf("error destroying CUDA event in context %p: %s", context,
963                        ToString(res).c_str()));
964   }
965 }
966 
RecordEvent(GpuContext * context,CUevent event,CUstream stream)967 /* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
968                                                  CUevent event,
969                                                  CUstream stream) {
970   ScopedActivateContext activated{context};
971   CUresult res = cuEventRecord(event, stream);
972   switch (res) {
973     case CUDA_SUCCESS:
974       return port::Status::OK();
975     case CUDA_ERROR_DEINITIALIZED:
976     case CUDA_ERROR_NOT_INITIALIZED:
977       return port::Status(
978           port::error::FAILED_PRECONDITION,
979           port::Printf("error recording CUDA event on stream %p: %s", stream,
980                        ToString(res).c_str()));
981     default:
982       return port::Status(
983           port::error::INVALID_ARGUMENT,
984           port::Printf("error recording CUDA event on stream %p: %s", stream,
985                        ToString(res).c_str()));
986   }
987 }
988 
QueryEvent(GpuContext * context,CUevent event)989 /* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
990                                                             CUevent event) {
991   ScopedActivateContext activated{context};
992   CUresult res = cuEventQuery(event);
993   if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
994     return port::Status(
995         port::error::INTERNAL,
996         port::Printf("failed to query event: %s", ToString(res).c_str()));
997   }
998 
999   return res;
1000 }
1001 
GetEventElapsedTime(GpuContext * context,float * elapsed_milliseconds,CUevent start,CUevent stop)1002 /* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
1003                                                  float* elapsed_milliseconds,
1004                                                  CUevent start, CUevent stop) {
1005   ScopedActivateContext activated{context};
1006   // The stop event must have completed in order for cuEventElapsedTime to
1007   // work.
1008   CUresult res = cuEventSynchronize(stop);
1009   if (res != CUDA_SUCCESS) {
1010     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
1011     return false;
1012   }
1013   res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
1014   if (res != CUDA_SUCCESS) {
1015     LOG(ERROR) << "failed to get elapsed time between events: "
1016                << ToString(res);
1017     return false;
1018   }
1019 
1020   return true;
1021 }
1022 
WaitStreamOnEvent(GpuContext * context,CUstream stream,CUevent event)1023 /* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
1024                                                CUstream stream, CUevent event) {
1025   ScopedActivateContext activation(context);
1026   CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
1027   if (res != CUDA_SUCCESS) {
1028     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
1029     return false;
1030   }
1031 
1032   return true;
1033 }
1034 
SynchronizeContext(GpuContext * context)1035 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
1036   ScopedActivateContext activation(context);
1037   CUresult res = cuCtxSynchronize();
1038   if (res != CUDA_SUCCESS) {
1039     LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
1040                << " :: " << port::CurrentStackTrace();
1041     return false;
1042   }
1043 
1044   return true;
1045 }
1046 
SynchronizeStream(GpuContext * context,CUstream stream)1047 /* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
1048                                                        CUstream stream) {
1049   ScopedActivateContext activated{context};
1050   CHECK(stream != nullptr);
1051   CUresult res = cuStreamSynchronize(stream);
1052   if (res != CUDA_SUCCESS) {
1053     port::Status status = port::InternalError(
1054         absl::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
1055     LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
1056     return status;
1057   }
1058   VLOG(2) << "successfully synchronized stream " << stream << " on context "
1059           << context;
1060   return port::Status::OK();
1061 }
1062 
IsStreamIdle(GpuContext * context,CUstream stream)1063 /* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
1064                                           CUstream stream) {
1065   ScopedActivateContext activated{context};
1066   CHECK(stream != nullptr);
1067   CUresult res = cuStreamQuery(stream);
1068   if (res == CUDA_SUCCESS) {
1069     return true;
1070   }
1071 
1072   if (res != CUDA_ERROR_NOT_READY) {
1073     LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
1074   }
1075   return false;
1076 }
1077 
SynchronousMemcpyD2H(GpuContext * context,void * host_dst,CUdeviceptr gpu_src,uint64 size)1078 /* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
1079                                                           void* host_dst,
1080                                                           CUdeviceptr gpu_src,
1081                                                           uint64 size) {
1082   ScopedActivateContext activation(context);
1083   if (size > 0) {
1084     CHECK(PointerIsValid(gpu_src))
1085         << "Source pointer is not actually on GPU: " << gpu_src;
1086     CHECK(PointerIsValid(host_dst))
1087         << "Destination pointer is not actually on CPU: " << host_dst;
1088   }
1089   CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
1090   if (res != CUDA_SUCCESS) {
1091     return port::InternalError(
1092         port::Printf("failed to synchronous memcpy from device to host: %s; "
1093                      "host dst: %p; GPU src: %p; size: %llu=0x%llx",
1094                      ToString(res).c_str(), host_dst,
1095                      absl::bit_cast<void*>(gpu_src), size, size));
1096   }
1097   VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
1098           << host_dst;
1099   return port::Status::OK();
1100 }
1101 
SynchronousMemcpyH2D(GpuContext * context,CUdeviceptr gpu_dst,const void * host_src,uint64 size)1102 /* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
1103                                                           CUdeviceptr gpu_dst,
1104                                                           const void* host_src,
1105                                                           uint64 size) {
1106   ScopedActivateContext activation(context);
1107   if (size > 0) {
1108     CHECK(PointerIsValid(host_src))
1109         << "Source pointer is not actually on CPU: " << host_src;
1110     CHECK(PointerIsValid(gpu_dst))
1111         << "Destination pointer is not actually on GPU: " << gpu_dst;
1112   }
1113   CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
1114   if (res != CUDA_SUCCESS) {
1115     return port::InternalError(port::Printf(
1116         "failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
1117         " host src: %p; size: %llu=0x%llx",
1118         ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
1119         size));
1120   }
1121   VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
1122   return port::Status::OK();
1123 }
1124 
SynchronousMemcpyD2D(GpuContext * context,CUdeviceptr gpu_dst,CUdeviceptr gpu_src,uint64 size)1125 /* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
1126                                                           CUdeviceptr gpu_dst,
1127                                                           CUdeviceptr gpu_src,
1128                                                           uint64 size) {
1129   ScopedActivateContext activation(context);
1130   if (size > 0) {
1131     CHECK(PointerIsValid(gpu_src))
1132         << "Source pointer is not actually on GPU: " << gpu_src;
1133     CHECK(PointerIsValid(gpu_dst))
1134         << "Destination pointer is not actually on GPU: " << gpu_dst;
1135   }
1136   CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
1137   if (res != CUDA_SUCCESS) {
1138     return port::InternalError(port::Printf(
1139         "failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
1140         "GPU src: %p; size: %llu=0x%llx",
1141         ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst),
1142         absl::bit_cast<void*>(gpu_src), size, size));
1143   }
1144   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
1145   return port::Status::OK();
1146 }
1147 
AsynchronousMemcpyD2H(GpuContext * context,void * host_dst,CUdeviceptr gpu_src,uint64 size,CUstream stream)1148 /* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
1149                                                    void* host_dst,
1150                                                    CUdeviceptr gpu_src,
1151                                                    uint64 size,
1152                                                    CUstream stream) {
1153   ScopedActivateContext activation(context);
1154   if (size > 0) {
1155     CHECK(PointerIsValid(gpu_src))
1156         << "Source pointer is not actually on GPU: " << gpu_src;
1157     CHECK(PointerIsValid(host_dst))
1158         << "Destination pointer is not actually on CPU: " << host_dst;
1159   }
1160   CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
1161   if (res != CUDA_SUCCESS) {
1162     LOG(ERROR) << port::Printf(
1163         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
1164         "GPU src: %p; size: %llu=0x%llx",
1165         ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
1166         size);
1167     return false;
1168   }
1169   VLOG(2) << "successfully enqueued async memcpy d2h of " << size
1170           << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
1171           << host_dst << " on stream " << stream;
1172   return true;
1173 }
1174 
AsynchronousMemcpyH2D(GpuContext * context,CUdeviceptr gpu_dst,const void * host_src,uint64 size,CUstream stream)1175 /* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
1176                                                    CUdeviceptr gpu_dst,
1177                                                    const void* host_src,
1178                                                    uint64 size,
1179                                                    CUstream stream) {
1180   ScopedActivateContext activation(context);
1181   if (size > 0) {
1182     CHECK(PointerIsValid(host_src))
1183         << "Source pointer is not actually on CPU: " << host_src;
1184     CHECK(PointerIsValid(gpu_dst))
1185         << "Destination pointer is not actually on GPU: " << gpu_dst;
1186   }
1187   CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
1188   if (res != CUDA_SUCCESS) {
1189     LOG(ERROR) << port::Printf(
1190         "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
1191         "host src: %p; size: %llu=0x%llx",
1192         ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
1193         size);
1194     return false;
1195   }
1196   VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
1197           << " on stream " << stream;
1198   return true;
1199 }
1200 
AsynchronousMemcpyD2D(GpuContext * context,CUdeviceptr gpu_dst,CUdeviceptr gpu_src,uint64 size,CUstream stream)1201 /* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
1202                                                    CUdeviceptr gpu_dst,
1203                                                    CUdeviceptr gpu_src,
1204                                                    uint64 size,
1205                                                    CUstream stream) {
1206   ScopedActivateContext activation(context);
1207   if (size > 0) {
1208     CHECK(PointerIsValid(gpu_src))
1209         << "Source pointer is not actually on GPU: " << gpu_src;
1210     CHECK(PointerIsValid(gpu_dst))
1211         << "Destination pointer is not actually on GPU: " << gpu_dst;
1212   }
1213   CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
1214   if (result != CUDA_SUCCESS) {
1215     LOG(ERROR) << port::Printf(
1216         "failed to enqueue async memcpy from device to device: %s"
1217         "; GPU dst: %p on %s %s"
1218         "; GPU src: %p on %s %s"
1219         "; can access? %s; size: %llu=0x%llx",
1220         ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
1221         CUDAPointerToMemorySpaceString(gpu_dst).c_str(),
1222         CUDAPointerToDeviceString(gpu_dst).c_str(),
1223         absl::bit_cast<void*>(gpu_src),
1224         CUDAPointerToMemorySpaceString(gpu_src).c_str(),
1225         CUDAPointerToDeviceString(gpu_src).c_str(),
1226         CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
1227 
1228     return false;
1229   }
1230   VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
1231   return true;
1232 }
1233 
CreateEvent(GpuContext * context,CUevent * result,EventFlags flags)1234 /* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
1235                                                  CUevent* result,
1236                                                  EventFlags flags) {
1237   int cuflags;
1238   switch (flags) {
1239     case EventFlags::kDefault:
1240       cuflags = CU_EVENT_DEFAULT;
1241       break;
1242     case EventFlags::kDisableTiming:
1243       cuflags = CU_EVENT_DISABLE_TIMING;
1244       break;
1245     default:
1246       LOG(FATAL) << "impossible event flags: " << int(flags);
1247   }
1248 
1249   ScopedActivateContext activated{context};
1250   CUresult res = cuEventCreate(result, cuflags);
1251 
1252   if (res == CUDA_SUCCESS) {
1253     return port::Status::OK();
1254   } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
1255     return port::Status(port::error::RESOURCE_EXHAUSTED,
1256                         "could not create CUDA event: out of device memory");
1257   } else {
1258     return port::Status(
1259         port::error::FAILED_PRECONDITION,
1260         absl::StrCat("could not create CUDA event: ", ToString(res)));
1261   }
1262 }
1263 
GetDeviceCount()1264 /* static */ int GpuDriver::GetDeviceCount() {
1265   int device_count = 0;
1266   CUresult res = cuDeviceGetCount(&device_count);
1267   if (res != CUDA_SUCCESS) {
1268     LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
1269     return 0;
1270   }
1271 
1272   if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
1273     device_count = 1;
1274   }
1275   return device_count;
1276 }
1277 
GetPointerContext(CUdeviceptr pointer)1278 /* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
1279     CUdeviceptr pointer) {
1280   GpuContext* context = nullptr;
1281   CUresult result =
1282       cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
1283   if (result == CUDA_SUCCESS) {
1284     CHECK(context != nullptr) << "success should entail non-null context";
1285     return context;
1286   }
1287 
1288   return port::Status(
1289       port::error::INTERNAL,
1290       absl::StrCat("failed to query device pointer for context: ",
1291                    ToString(result)));
1292 }
1293 
GetPointerMemorySpace(CUdeviceptr pointer)1294 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1295     CUdeviceptr pointer) {
1296   unsigned int value;
1297   CUresult result =
1298       cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
1299   if (result == CUDA_SUCCESS) {
1300     switch (value) {
1301       case CU_MEMORYTYPE_DEVICE:
1302         return MemorySpace::kDevice;
1303       case CU_MEMORYTYPE_HOST:
1304         return MemorySpace::kHost;
1305       default:
1306         return port::Status(
1307             port::error::INTERNAL,
1308             absl::StrCat("unknown memory space provided by CUDA API: ", value));
1309     }
1310   }
1311 
1312   return port::Status(
1313       port::error::INTERNAL,
1314       absl::StrCat("failed to query device pointer for memory space: ",
1315                    ToString(result)));
1316 }
1317 
GetPointerAddressRange(CUdeviceptr dptr,CUdeviceptr * base,size_t * size)1318 /* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
1319                                                             CUdeviceptr* base,
1320                                                             size_t* size) {
1321   CUresult result = cuMemGetAddressRange(base, size, dptr);
1322   if (result == CUDA_SUCCESS) {
1323     return port::Status::OK();
1324   } else if (result == CUDA_ERROR_NOT_FOUND) {
1325     // We differentiate between "this pointer is unknown" (return here) and
1326     // "there was an internal error while performing this operation" (return
1327     // below).
1328     return port::Status(
1329         port::error::NOT_FOUND,
1330         port::Printf("not a device pointer %p; %s",
1331                      reinterpret_cast<void*>(dptr), ToString(result).c_str()));
1332   }
1333 
1334   return port::Status(
1335       port::error::INTERNAL,
1336       port::Printf("failed to get pointer into for device pointer %p; %s",
1337                    reinterpret_cast<void*>(dptr), ToString(result).c_str()));
1338 }
1339 
GetPointerDevice(CUdeviceptr pointer)1340 /* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
1341     CUdeviceptr pointer) {
1342   auto result = GetPointerContext(pointer);
1343   if (!result.ok()) {
1344     return result.status();
1345   }
1346 
1347   return DeviceFromContext(result.ValueOrDie());
1348 }
1349 
GetComputeCapability(int * cc_major,int * cc_minor,CUdevice device)1350 /* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
1351                                                           int* cc_minor,
1352                                                           CUdevice device) {
1353   *cc_major = 0;
1354   *cc_minor = 0;
1355 
1356   CUresult res = cuDeviceGetAttribute(
1357       cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
1358   if (res != CUDA_SUCCESS) {
1359     return port::Status(
1360         port::error::INTERNAL,
1361         port::Printf(
1362             "failed to get compute capability major for device: %s; %d",
1363             ToString(res).c_str(), device));
1364   }
1365 
1366   res = cuDeviceGetAttribute(
1367       cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
1368   if (res != CUDA_SUCCESS) {
1369     return port::Status(
1370         port::error::INTERNAL,
1371         port::Printf(
1372             "failed to get compute capability minor for device: %s; %d",
1373             ToString(res).c_str(), device));
1374   }
1375 
1376   return port::Status::OK();
1377 }
1378 
GetGpuISAVersion(int * version,CUdevice device)1379 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
1380                                                       CUdevice device) {
1381   return port::Status{
1382       port::error::INTERNAL,
1383       "Feature not supported on CUDA platform (GetGpuISAVersion)"};
1384 }
1385 
1386 // Helper function that turns the integer output of cuDeviceGetAttribute to type
1387 // T and wraps it in a StatusOr.
1388 template <typename T>
GetSimpleAttribute(CUdevice device,CUdevice_attribute attribute)1389 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
1390                                             CUdevice_attribute attribute) {
1391   int value = -1;
1392   CUresult result = cuDeviceGetAttribute(&value, attribute, device);
1393   if (result != CUDA_SUCCESS) {
1394     return port::Status(
1395         port::error::NOT_FOUND,
1396         absl::StrCat("could not retrieve CUDA device attribute (", attribute,
1397                      "): ", ToString(result)));
1398   }
1399   T converted = value;
1400   return converted;
1401 }
1402 
GetMultiprocessorCount(CUdevice device)1403 /* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
1404     CUdevice device) {
1405   return GetSimpleAttribute<int>(device,
1406                                  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
1407 }
1408 
GetMaxSharedMemoryPerCore(CUdevice device)1409 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
1410     CUdevice device) {
1411   return GetSimpleAttribute<int64>(
1412       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
1413 }
1414 
GetMaxSharedMemoryPerBlock(CUdevice device)1415 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
1416     CUdevice device) {
1417   return GetSimpleAttribute<int64>(
1418       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
1419 }
1420 
GetMaxThreadsPerMultiprocessor(CUdevice device)1421 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
1422     CUdevice device) {
1423   return GetSimpleAttribute<int64>(
1424       device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
1425 }
1426 
GetMaxThreadsPerBlock(CUdevice device)1427 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
1428     CUdevice device) {
1429   return GetSimpleAttribute<int64>(device,
1430                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
1431 }
1432 
GetMaxRegistersPerBlock(CUdevice device)1433 /* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
1434     CUdevice device) {
1435   return GetSimpleAttribute<int64>(device,
1436                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
1437 }
1438 
GetThreadsPerWarp(CUdevice device)1439 /* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
1440     CUdevice device) {
1441   return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
1442 }
1443 
GetGridLimits(int * x,int * y,int * z,CUdevice device)1444 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1445                                            CUdevice device) {
1446   int value;
1447   CUresult res =
1448       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
1449   if (res != CUDA_SUCCESS) {
1450     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1451     return false;
1452   }
1453   *x = value;
1454 
1455   res =
1456       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
1457   if (res != CUDA_SUCCESS) {
1458     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1459     return false;
1460   }
1461   *y = value;
1462 
1463   res =
1464       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
1465   if (res != CUDA_SUCCESS) {
1466     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1467     return false;
1468   }
1469   *z = value;
1470   return true;
1471 }
1472 
GetDriverVersion(int * driver_version)1473 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1474   CUresult res = cuDriverGetVersion(driver_version);
1475   if (res != CUDA_SUCCESS) {
1476     LOG(ERROR) << "failed to query driver version: " << ToString(res);
1477     return false;
1478   }
1479 
1480   return true;
1481 }
1482 
GetDeviceProperties(CUdevprop * device_properties,int device_ordinal)1483 /* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
1484                                                  int device_ordinal) {
1485   CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
1486   if (res != CUDA_SUCCESS) {
1487     LOG(ERROR) << "failed to query device properties: " << ToString(res);
1488     return false;
1489   }
1490 
1491   return true;
1492 }
1493 
GetDeviceAttribute(CUdevice_attribute attribute,CUdevice device)1494 /* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
1495     CUdevice_attribute attribute, CUdevice device) {
1496   int val;
1497   CUresult res = cuDeviceGetAttribute(&val, attribute, device);
1498   if (res != CUDA_SUCCESS) {
1499     return port::Status(
1500         port::error::INTERNAL,
1501         port::Printf("failed to get device attribute %d for device %d: %s",
1502                      attribute, device, ToString(res).c_str()));
1503   }
1504   return val;
1505 }
1506 
IsEccEnabled(CUdevice device,bool * result)1507 /* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
1508   int value = -1;
1509   CUresult res =
1510       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
1511   if (res != CUDA_SUCCESS) {
1512     LOG(ERROR) << "failed to query ECC status: " << ToString(res);
1513     return false;
1514   }
1515 
1516   *result = value;
1517   return true;
1518 }
1519 
GetDeviceMemoryInfo(GpuContext * context,int64 * free_out,int64 * total_out)1520 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1521                                                  int64* free_out,
1522                                                  int64* total_out) {
1523   ScopedActivateContext activation(context);
1524   size_t free = 0;
1525   size_t total = 0;
1526   CUresult res = cuMemGetInfo(&free, &total);
1527   if (res != CUDA_SUCCESS) {
1528     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1529     return false;
1530   }
1531 
1532   *free_out = free;
1533   *total_out = total;
1534   return true;
1535 }
1536 
GetDeviceTotalMemory(CUdevice device,uint64 * result)1537 /* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
1538                                                   uint64* result) {
1539   size_t value = -1;
1540   CUresult res = cuDeviceTotalMem(&value, device);
1541   if (res != CUDA_SUCCESS) {
1542     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1543     return false;
1544   }
1545 
1546   *result = value;
1547   return true;
1548 }
1549 
GetPCIBusID(CUdevice device)1550 /* static */ string GpuDriver::GetPCIBusID(CUdevice device) {
1551   string pci_bus_id;
1552   static const int kBufferSize = 64;
1553   absl::InlinedVector<char, 4> chars(kBufferSize);
1554   chars[kBufferSize - 1] = '\0';
1555   CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
1556   if (res != CUDA_SUCCESS) {
1557     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1558     return pci_bus_id;
1559   }
1560   pci_bus_id = chars.begin();
1561   return pci_bus_id;
1562 }
1563 
CanEnablePeerAccess(GpuContext * from,GpuContext * to)1564 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
1565                                                  GpuContext* to) {
1566   if (from == to) {
1567     return true;  // A context can always access its own memory.
1568   }
1569 
1570   int can_access_peer = -1;
1571   auto from_device = DeviceFromContext(from);
1572   if (!from_device.ok()) {
1573     LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
1574                << from_device.status();
1575     return false;
1576   }
1577   auto to_device = DeviceFromContext(to);
1578   if (!to_device.ok()) {
1579     LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
1580                << to_device.status();
1581     return false;
1582   }
1583   CUresult res = cuDeviceCanAccessPeer(
1584       &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
1585   if (res != CUDA_SUCCESS) {
1586     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
1587     return false;
1588   }
1589 
1590   return can_access_peer;
1591 }
1592 
EnablePeerAccess(GpuContext * from,GpuContext * to)1593 /* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
1594                                                       GpuContext* to) {
1595   if (from == to) {
1596     return port::Status::OK();  // A context can always access its own memory.
1597   }
1598 
1599   ScopedActivateContext activated{from};
1600   CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
1601   if (result != CUDA_SUCCESS &&
1602       result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
1603     return port::Status(
1604         port::error::INTERNAL,
1605         port::Printf("failed to enable peer access from %p to %p: %s", from, to,
1606                      ToString(result).c_str()));
1607   }
1608 
1609   return port::Status::OK();
1610 }
1611 
GetMaxOccupiedBlocksPerCore(GpuContext * context,CUfunction kernel,int threads_per_block,size_t dynamic_shared_memory_bytes)1612 /* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
1613     GpuContext* context, CUfunction kernel, int threads_per_block,
1614     size_t dynamic_shared_memory_bytes) {
1615   ScopedActivateContext activation(context);
1616 
1617   int max_blocks;
1618   CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
1619       &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
1620   if (result != CUDA_SUCCESS) {
1621     return port::Status(
1622         port::error::INTERNAL,
1623         port::Printf("failed to calculate occupancy of kernel %p: %s", kernel,
1624                      ToString(result).c_str()));
1625   }
1626 
1627   return max_blocks;
1628 }
1629 
1630 }  // namespace gpu
1631 
1632 namespace cuda {
1633 
CurrentContextOrDie()1634 CUcontext CurrentContextOrDie() {
1635   CUcontext current = nullptr;
1636   CUresult result = cuCtxGetCurrent(&current);
1637   if (result != CUDA_SUCCESS) {
1638     LOG(FATAL) << "failed to query current context: " << gpu::ToString(result);
1639   }
1640   return current;
1641 }
1642 
1643 }  // namespace cuda
1644 }  // namespace stream_executor
1645