1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
17 
18 #include <stdint.h>
19 #include <stdlib.h>
20 
21 #include <map>
22 #include <set>
23 #include <utility>
24 
25 #include "absl/base/casts.h"
26 #include "absl/base/const_init.h"
27 #include "absl/container/inlined_vector.h"
28 #include "absl/debugging/leak_check.h"
29 #include "absl/memory/memory.h"
30 #include "absl/strings/str_cat.h"
31 #include "absl/strings/str_format.h"
32 #include "absl/synchronization/mutex.h"
33 #include "absl/synchronization/notification.h"
34 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
35 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
36 #include "tensorflow/stream_executor/lib/env.h"
37 #include "tensorflow/stream_executor/lib/error.h"
38 #include "tensorflow/stream_executor/lib/human_readable.h"
39 #include "tensorflow/stream_executor/lib/stacktrace.h"
40 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
41 #include "tensorflow/stream_executor/lib/threadpool.h"
42 #include "tensorflow/stream_executor/platform/logging.h"
43 #include "tensorflow/stream_executor/platform/port.h"
44 
// Debug/testing knobs (plain globals, not parsed as command-line flags here):
//  - inject_init_error: makes InternalInit() skip cuInit and report failure.
//  - sync_around_driver_calls: makes ScopedActivateContext cuCtxSynchronize on
//    construction and destruction, to localize asynchronous errors.
//  - device_0_only: presumably restricts enumeration to device ordinal 0; its
//    use is outside this chunk — confirm against the rest of the file.
bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
bool FLAGS_gpuexec_cuda_device_0_only = false;
48 
// Evaluates `expr` (a CUresult-returning driver call). On failure, returns a
// port::InternalError from the enclosing function, built by StrCat'ing the
// remaining arguments with the stringified CUresult.
#define RETURN_IF_CUDA_RES_ERROR(expr, ...)                            \
  do {                                                                 \
    CUresult _res = (expr);                                            \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {                      \
      return port::InternalError(absl::StrCat(                         \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
    }                                                                  \
  } while (0)
57 
// Like RETURN_IF_CUDA_RES_ERROR, but crashes the process (LOG(FATAL)) instead
// of returning a status. For use where failure is unrecoverable.
#define FAIL_IF_CUDA_RES_ERROR(expr, ...)                   \
  do {                                                      \
    CUresult _res = (expr);                                 \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {           \
      LOG(FATAL) << absl::StrCat(__VA_ARGS__) << ": "       \
                 << ::stream_executor::gpu::ToString(_res); \
    }                                                       \
  } while (0)
66 
// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one. Compile-time constant; flip to true locally when
// chasing context-mixup bugs (adds a driver query per scope).
constexpr bool kVerifyGpuContext = false;
70 
71 namespace stream_executor {
72 namespace gpu {
73 namespace {
74 
75 // Manages the singleton map of contexts that we've created, mapping
76 // from the CUcontext to the GpuContext* that we pass around internally.
77 // This also manages assignment of unique ids to GpuContexts, to allow
78 // for fast comparison of a context against the current context.
79 //
80 // CUDA-runtime-created contexts are avoided, if triple angle
81 // brace launches are required, by using the scoped activations in
82 // gpu/gpu_activation.h.
// Manages the singleton map of contexts that we've created, mapping
// from the CUcontext to the GpuContext* that we pass around internally.
// This also manages assignment of unique ids to GpuContexts, to allow
// for fast comparison of a context against the current context.
//
// CUDA-runtime-created contexts are avoided, if triple angle
// brace launches are required, by using the scoped activations in
// gpu/gpu_activation.h.
//
// Thread-safe: all accessors take mu_.
class CreatedContexts {
 public:
  // Returns whether context is a member of the live set.
  static bool Has(CUcontext context) {
    absl::ReaderMutexLock lock(&mu_);
    return Live()->find(context) != Live()->end();
  }

  // Adds context to the live set, or returns it if it's already present.
  static GpuContext* Add(CUcontext context) {
    CHECK(context != nullptr);
    absl::MutexLock lock(&mu_);
    // insert() with a null value first, so we only construct a GpuContext
    // (and consume an id) when the key was genuinely new.
    auto insert_result = Live()->insert(std::make_pair(context, nullptr));
    auto it = insert_result.first;
    if (insert_result.second) {
      // context was not present in the map.  Add it.
      it->second = absl::make_unique<GpuContext>(context, next_id_++);
    }
    return it->second.get();
  }

  // Removes context from the live set. Crashes if it was not present.
  static void Remove(CUcontext context) {
    CHECK(context != nullptr);
    absl::MutexLock lock(&mu_);
    auto it = Live()->find(context);
    CHECK(it != Live()->end()) << context;
    Live()->erase(it);
  }

 private:
  // Returns the live map singleton. Heap-allocated and never freed, so it
  // stays valid through static destruction.
  static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
    static auto singleton =
        new std::map<CUcontext, std::unique_ptr<GpuContext>>;
    return singleton;
  }

  // Lock that guards access-to/mutation-of the live set.
  static absl::Mutex mu_;
  // Next unique id to hand out; monotonically increasing under mu_.
  static int64 next_id_;
};

/* static */ absl::Mutex CreatedContexts::mu_{absl::kConstInit};
/* static */ int64 CreatedContexts::next_id_ = 1;  // 0 means "no context"
128 
129 // Formats CUresult to output prettified values into a log stream.
ToString(CUresult result)130 std::string ToString(CUresult result) {
131   const char* error_name;
132   if (cuGetErrorName(result, &error_name)) {
133     return absl::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
134   }
135   const char* error_string;
136   if (cuGetErrorString(result, &error_string)) {
137     return error_name;
138   }
139   return absl::StrCat(error_name, ": ", error_string);
140 }
141 
142 // Returns the current context and checks that it is in the set of CUDA contexts
143 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
144 // context behind our backs).
CurrentContext()145 CUcontext CurrentContext() {
146   CUcontext current = cuda::CurrentContextOrDie();
147   if (current != nullptr && !CreatedContexts::Has(current)) {
148     LOG(FATAL) << "current context was not created by the StreamExecutor "
149                   "cuda_driver API: "
150                << current
151                << "; a CUDA runtime call "
152                   "was likely performed without using a StreamExecutor context";
153   }
154   return current;
155 }
156 
157 // CUDA driver routines may require a large amount of stack (particularly
158 // cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
159 // stack-limited threads (such as those spawned by a default-argument
160 // thread::ThreadPool on some platforms), we run certain routines in this pool
161 // and wait for completion.
GetDriverExecutor()162 port::ThreadPool* GetDriverExecutor() {
163   static port::ThreadPool* thread_pool = new port::ThreadPool(
164       port::Env::Default(), port::ThreadOptions(), "cuda_driver", 1);
165   return thread_pool;
166 }
167 
168 }  // namespace
169 
MemorySpaceString(MemorySpace memory_space)170 std::string MemorySpaceString(MemorySpace memory_space) {
171   switch (memory_space) {
172     case MemorySpace::kHost:
173       return "host";
174     case MemorySpace::kDevice:
175       return "device";
176     default:
177       LOG(FATAL) << "impossible memory space";
178   }
179 }
180 
181 namespace {
182 
// Call cuCtxSynchronize and crash if it doesn't succeed.
// The fatal message includes the caller's stack trace to help locate the
// driver call that left the context in a bad state.
void SynchronizeOrDie() {
  FAIL_IF_CUDA_RES_ERROR(cuCtxSynchronize(),
                         "Synchronize fail: ", port::CurrentStackTrace());
}
188 
// Per-thread bookkeeping used by ScopedActivateContext: which GpuContext this
// thread last made current and how deeply activation scopes are nested.
struct ThreadLocalData {
  int64 id;             // id of the context this thread last activated
  GpuContext* context;  // Only valid if id == a known good context.
  int depth;            // nesting depth of ScopedActivateContext on this thread
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
196 
197 }  // namespace
198 
// Makes cuda_context current on this thread for the lifetime of the scope.
// Uses thread-local state to skip redundant cuCtxSetCurrent calls when the
// requested context is already active, and records what (if anything) the
// destructor must restore.
ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  // If this is an outermost scope, we must not assume that the CUDA context has
  // been left in the same state we left it. Other code may have run on this
  // thread and altered the context.
  if (tls->depth == 0) {
    VLOG(3) << "ScopedActivateContext switching to " << cuda_context->id();
    FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                           "Failed setting context");
    tls->depth = 1;
    tls->id = cuda_context->id();
    tls->context = cuda_context;
    // Outermost scope: nothing to restore on destruction; the context is
    // deliberately left current when this scope exits.
    to_restore_ = nullptr;
    return;
  }

  tls->depth++;
  if (tls->id == cuda_context->id()) {
    // Requested context is already current; no driver call needed.
    if (kVerifyGpuContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching context from " << tls->id
          << " to " << cuda_context->id();

  // Nested scope switching to a different context: remember the old one so
  // the destructor can put it back.
  to_restore_ = tls->context;
  // Set the context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                         "Failed setting context");
  tls->id = cuda_context->id();
  tls->context = cuda_context;
}
237 
// Unwinds one activation level; restores the previously-current context if
// the constructor recorded one (i.e. this nested scope switched contexts).
ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);
  if (to_restore_ == nullptr) {
    // Leave context, tls->id, and tls->context set.
    return;
  }

  // Set context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(to_restore_->context()),
                         "Failed setting context");
  tls->id = to_restore_->id();
  tls->context = to_restore_;
}
263 
264 namespace {
265 
266 // Returns a stringified device number associated with pointer, primarily for
267 // logging purposes. Returns "?" if the device could not be successfully
268 // queried.
CUDAPointerToDeviceString(CUdeviceptr pointer)269 std::string CUDAPointerToDeviceString(CUdeviceptr pointer) {
270   auto value = GpuDriver::GetPointerDevice(pointer);
271   if (value.ok()) {
272     return absl::StrCat(value.ValueOrDie());
273   }
274   LOG(ERROR) << "could not query device: " << value.status();
275   return "?";
276 }
277 
278 // Returns a stringified memory space associated with pointer, primarily for
279 // logging purposes. Returns "?" if the memory space could not be successfully
280 // queried.
CUDAPointerToMemorySpaceString(CUdeviceptr pointer)281 std::string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
282   auto value = GpuDriver::GetPointerMemorySpace(pointer);
283   if (value.ok()) {
284     return MemorySpaceString(value.ValueOrDie());
285   }
286   LOG(ERROR) << "could not query device: " << value.status();
287   return "?";
288 }
289 
290 // Returns a stringified representation of whether or not peer access is
291 // permitted between the "from" and "to" pointers' associated contexts,
292 // primarily for logging purposes. Returns "error" if an error is encountered
293 // in the process of querying.
CUDAPointersToCanAccessString(CUdeviceptr from,CUdeviceptr to)294 std::string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
295   auto from_context = GpuDriver::GetPointerContext(from);
296   if (!from_context.ok()) {
297     LOG(ERROR) << "could not retrieve source pointer's context: "
298                << from_context.status();
299     return "error";
300   }
301   auto to_context = GpuDriver::GetPointerContext(to);
302   if (!to_context.ok()) {
303     LOG(ERROR) << "could not retrieve destination pointer's context: "
304                << to_context.status();
305     return "error";
306   }
307   return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
308                                         to_context.ValueOrDie())
309              ? "true"
310              : "false";
311 }
312 
// Actually performs the work of CUDA initialization. Wrapped up in one-time
// execution guard.
// Returns OK on success; on failure logs diagnostics and returns ABORTED.
static port::Status InternalInit() {
  CUresult res = CUDA_ERROR_NO_DEVICE;
  if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
    // Test hook: pretend cuInit failed without calling it.
    LOG(ERROR) << "injecting CUDA init error; initialization will fail";
  } else {
    res = cuInit(0 /* = flags */);
  }

  if (res == CUDA_SUCCESS) {
    return port::Status::OK();
  } else if (res == CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) {
    // Driver shared-object init failure is common on driverless machines;
    // log at WARNING rather than ERROR.
    LOG(WARNING) << "failed call to cuInit: " << ToString(res);
  } else {
    LOG(ERROR) << "failed call to cuInit: " << ToString(res);
  }

  // Dump driver/kernel-module diagnostics to help the user debug the failure.
  Diagnostician::LogDiagnosticInformation();
  return port::Status(port::error::ABORTED,
                      absl::StrCat("failed call to cuInit: ", ToString(res)));
}
335 
336 }  // namespace
337 
Init()338 /* static */ port::Status GpuDriver::Init() {
339   // Cached return value from calling InternalInit(), as cuInit need only be
340   // called once, but GpuDriver::Init may be called many times.
341   static port::Status* init_retval = [] {
342     return new port::Status(InternalInit());
343   }();
344   return *init_retval;
345 }
346 
GetDevice(int device_ordinal,CUdevice * device)347 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
348                                                CUdevice* device) {
349   RETURN_IF_CUDA_RES_ERROR(cuDeviceGet(device, device_ordinal),
350                            "Failed call to cuDeviceGet");
351   return port::Status::OK();
352 }
353 
GetDeviceName(CUdevice device,std::string * device_name)354 /* static */ port::Status GpuDriver::GetDeviceName(CUdevice device,
355                                                    std::string* device_name) {
356   static const size_t kCharLimit = 64;
357   absl::InlinedVector<char, 4> chars(kCharLimit);
358   RETURN_IF_CUDA_RES_ERROR(
359       cuDeviceGetName(chars.begin(), kCharLimit - 1, device),
360       "Failed to get device name");
361   chars[kCharLimit - 1] = '\0';
362   *device_name = chars.begin();
363   return port::Status::OK();
364 }
365 
DeviceOptionsToContextFlags(const DeviceOptions & device_options,int * flags)366 bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
367                                  int* flags) {
368   static_assert(DeviceOptions::kMask == 0xf,
369                 "needs update for new device options");
370 
371   if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
372     *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
373   }
374 
375   // If no flags are set the default is CU_CTX_SCHED_AUTO, which
376   // in Google environments is very likely to mean SPIN.
377   if (device_options.flags() & DeviceOptions::kScheduleSpin) {
378     *flags |= CU_CTX_SCHED_SPIN;
379   }
380   if (device_options.flags() & DeviceOptions::kScheduleYield) {
381     *flags |= CU_CTX_SCHED_YIELD;
382   }
383   if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
384     *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
385   }
386 
387   return true;
388 }
389 
CreateContext(int device_ordinal,CUdevice device,const DeviceOptions & device_options,GpuContext ** context)390 /* static */ port::Status GpuDriver::CreateContext(
391     int device_ordinal, CUdevice device, const DeviceOptions& device_options,
392     GpuContext** context) {
393   *context = nullptr;
394 
395   int flags = 0;
396   if (!DeviceOptionsToContextFlags(device_options, &flags)) {
397     LOG(WARNING) << "could not convert all device options into context flags";
398   }
399 
400   CUresult res;
401   CUcontext former_context;
402   CUcontext new_context;
403 
404   unsigned int former_primary_context_flags;
405   int former_primary_context_is_active;
406   CHECK_EQ(CUDA_SUCCESS,
407            cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
408                                       &former_primary_context_is_active));
409   if (former_primary_context_flags != flags) {
410     if (former_primary_context_is_active) {
411       LOG(ERROR)
412           << "The primary context is active and has a different flag set ("
413           << former_primary_context_flags << ") than the desired flag set ("
414           << flags << ").";
415     } else {
416       CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
417     }
418   }
419 
420   former_context = cuda::CurrentContextOrDie();
421   res = cuDevicePrimaryCtxRetain(&new_context, device);
422   if (former_context != nullptr) {
423     CUdevice former_device;
424     if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
425       if (former_device == device) {
426         if (former_context == new_context) {
427           VLOG(2) << "The primary context " << former_context << " for device "
428                   << device
429                   << " exists before initializing the StreamExecutor.";
430         } else {
431           LOG(WARNING) << "A non-primary context " << former_context
432                        << " for device " << device
433                        << " exists before initializing the StreamExecutor. The "
434                        << "primary context is now " << new_context << ". We "
435                        << "haven't verified StreamExecutor works with that.";
436         }
437       }
438     } else {
439       LOG(ERROR) << "Failed to get the device of the current context "
440                  << former_context;
441     }
442   }
443   CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));
444 
445   if (res == CUDA_SUCCESS) {
446     *context = CreatedContexts::Add(new_context);
447     CHECK(*context != nullptr)
448         << "success in this call must entail non-null result";
449     VLOG(2) << "created or reused context " << new_context
450             << " for this thread";
451     return port::Status::OK();
452   }
453 
454   std::string message =
455       "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
456   if (res == CUDA_ERROR_OUT_OF_MEMORY) {
457     uint64 total_memory;
458     if (GetDeviceTotalMemory(device, &total_memory)) {
459       absl::StrAppend(&message, "; total memory reported: ", total_memory);
460     } else {
461       absl::StrAppend(&message, "; could not query total memory");
462     }
463   }
464 
465   return port::Status(port::error::INTERNAL, message);
466 }
467 
DestroyContext(GpuContext * context)468 /* static */ void GpuDriver::DestroyContext(GpuContext* context) {
469   if (context == nullptr) {
470     return;
471   }
472   CUcontext former_context = CurrentContext();
473   CUresult res = cuCtxSetCurrent(context->context());
474   CUdevice device;
475   cuCtxGetDevice(&device);
476   cuCtxSetCurrent(former_context);
477 
478   res = cuDevicePrimaryCtxRelease(device);
479 
480   if (res != CUDA_SUCCESS) {
481     LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
482   }
483 
484   CreatedContexts::Remove(context->context());
485 }
486 
FuncGetAttribute(CUfunction_attribute attribute,CUfunction func,int * attribute_value)487 /* static */ port::Status GpuDriver::FuncGetAttribute(
488     CUfunction_attribute attribute, CUfunction func, int* attribute_value) {
489   RETURN_IF_CUDA_RES_ERROR(cuFuncGetAttribute(attribute_value, attribute, func),
490                            "Failed to query kernel attribute: ", attribute);
491   return port::Status::OK();
492 }
493 
FuncSetCacheConfig(CUfunction function,CUfunc_cache cache_config)494 /* static */ port::Status GpuDriver::FuncSetCacheConfig(
495     CUfunction function, CUfunc_cache cache_config) {
496   RETURN_IF_CUDA_RES_ERROR(cuFuncSetCacheConfig(function, cache_config),
497                            "Failed to set CUDA kernel cache config");
498   return port::Status::OK();
499 }
500 
// Returns the context-wide shared-memory bank-size configuration.
/* static */ port::StatusOr<CUsharedconfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  CUsharedconfig shared_mem_config;
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxGetSharedMemConfig(&shared_mem_config),
                           "Failed to get shared memory config");
  return shared_mem_config;
}
509 
ContextSetSharedMemConfig(GpuContext * context,CUsharedconfig shared_mem_config)510 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
511     GpuContext* context, CUsharedconfig shared_mem_config) {
512   ScopedActivateContext activation(context);
513   RETURN_IF_CUDA_RES_ERROR(cuCtxSetSharedMemConfig(shared_mem_config),
514                            "Failed to set shared memory config");
515   return port::Status::OK();
516 }
517 
LaunchKernel(GpuContext * context,CUfunction function,unsigned int grid_dim_x,unsigned int grid_dim_y,unsigned int grid_dim_z,unsigned int block_dim_x,unsigned int block_dim_y,unsigned int block_dim_z,unsigned int shared_mem_bytes,CUstream stream,void ** kernel_params,void ** extra)518 /* static */ port::Status GpuDriver::LaunchKernel(
519     GpuContext* context, CUfunction function, unsigned int grid_dim_x,
520     unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
521     unsigned int block_dim_y, unsigned int block_dim_z,
522     unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
523     void** extra) {
524   ScopedActivateContext activation(context);
525   VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
526           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
527           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
528           << " bdz: " << block_dim_z;
529   RETURN_IF_CUDA_RES_ERROR(
530       cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x,
531                      block_dim_y, block_dim_z, shared_mem_bytes, stream,
532                      kernel_params, extra),
533       "Failed to launch CUDA kernel");
534   return port::Status::OK();
535 }
536 
LoadCubin(GpuContext * context,const char * cubin_bytes,CUmodule * module)537 /* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
538                                                const char* cubin_bytes,
539                                                CUmodule* module) {
540   ScopedActivateContext activation(context);
541   RETURN_IF_CUDA_RES_ERROR(cuModuleLoadFatBinary(module, cubin_bytes),
542                            "Failed to load in-memory CUBIN");
543   return port::Status::OK();
544 }
545 
LoadPtx(GpuContext * context,const char * ptx_contents,CUmodule * module)546 /* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
547                                              const char* ptx_contents,
548                                              CUmodule* module) {
549   absl::Notification notification;
550   port::Status ret = port::Status::OK();
551   GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
552                                  &notification]() {
553     ScopedActivateContext activation(context);
554     void* ptx_data = const_cast<char*>(ptx_contents);
555     static const unsigned int kLogBufferBytesLimit = 1024;
556     unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
557     unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
558     absl::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
559     absl::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
560     bool log_verbose = true;
561     CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
562                               CU_JIT_ERROR_LOG_BUFFER,
563                               CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
564                               CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
565     // Note that the driver API wants the contents of this values to be stored
566     // in an array of void*s, so we coerce them accordingly.
567     void* option_values[] = {
568         absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
569         absl::bit_cast<void*>(error_log_buffer.data()),
570         absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
571         absl::bit_cast<void*>(info_log_buffer.data()),
572         absl::bit_cast<void*>(uintptr_t(log_verbose))};
573     CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
574 
575     CUresult res;
576     {
577       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
578       // module loading: see http://b/13248943
579       absl::LeakCheckDisabler disabler;
580       res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), options,
581                                option_values);
582     }
583 
584     // The PTX JIT mutates the values in the option values array to reflect the
585     // size of the logs it output; now that we've made the call, read the values
586     // back out.
587     error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
588     info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
589     CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
590     CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
591 
592     if (res != CUDA_SUCCESS) {
593       LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
594       // As a precaution for null termination of the API-provided value, ensure
595       // that at least the last byte is null.
596       error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
597                                               : 0] = '\0';
598       LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
599                  << " bytes): " << error_log_buffer.data();
600       ret = port::InternalError(
601           absl::StrCat("Failed to load PTX text as a module: ", ToString(res)));
602       notification.Notify();
603     }
604 
605     VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
606             << " bytes): " << info_log_buffer.data();
607     VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
608             << " bytes): " << error_log_buffer.data();
609     CHECK(module != nullptr);
610     notification.Notify();
611   });
612   notification.WaitForNotification();
613 
614   return ret;
615 }
616 
LoadHsaco(GpuContext * context,const char * hsaco_contents,CUmodule * module)617 /* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
618                                                const char* hsaco_contents,
619                                                CUmodule* module) {
620   return port::InternalError(
621       "Feature not supported on CUDA platform (LoadHsaco)");
622 }
623 
SynchronousMemsetUint8(GpuContext * context,CUdeviceptr location,uint8 value,size_t size)624 /* static */ port::Status GpuDriver::SynchronousMemsetUint8(
625     GpuContext* context, CUdeviceptr location, uint8 value, size_t size) {
626   ScopedActivateContext activation(context);
627   RETURN_IF_CUDA_RES_ERROR(cuMemsetD8(location, value, size),
628                            "Failed to memset memory");
629   return port::Status::OK();
630 }
631 
SynchronousMemsetUint32(GpuContext * context,CUdeviceptr location,uint32 value,size_t uint32_count)632 /* static */ port::Status GpuDriver::SynchronousMemsetUint32(
633     GpuContext* context, CUdeviceptr location, uint32 value,
634     size_t uint32_count) {
635   ScopedActivateContext activation(context);
636   RETURN_IF_CUDA_RES_ERROR(cuMemsetD32(location, value, uint32_count),
637                            "Failed to memset memory");
638   return port::Status::OK();
639 }
640 
AsynchronousMemsetUint8(GpuContext * context,CUdeviceptr location,uint8 value,size_t uint32_count,CUstream stream)641 /* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
642     GpuContext* context, CUdeviceptr location, uint8 value, size_t uint32_count,
643     CUstream stream) {
644   ScopedActivateContext activation(context);
645   RETURN_IF_CUDA_RES_ERROR(
646       cuMemsetD8Async(location, value, uint32_count, stream),
647       "Failed to enqueue async memset operation");
648   return port::Status::OK();
649 }
650 
AsynchronousMemsetUint32(GpuContext * context,CUdeviceptr location,uint32 value,size_t uint32_count,CUstream stream)651 /* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
652     GpuContext* context, CUdeviceptr location, uint32 value,
653     size_t uint32_count, CUstream stream) {
654   ScopedActivateContext activation(context);
655   RETURN_IF_CUDA_RES_ERROR(
656       cuMemsetD32Async(location, value, uint32_count, stream),
657       "Failed to enqueue async memset operation");
658   return port::Status::OK();
659 }
660 
AddStreamCallback(GpuContext * context,CUstream stream,StreamCallback callback,void * data)661 /* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
662                                                CUstream stream,
663                                                StreamCallback callback,
664                                                void* data) {
665   // Note: flags param is required to be zero according to CUDA 6.0.
666   CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
667   if (res != CUDA_SUCCESS) {
668     LOG(ERROR) << "unable to add host callback: " << ToString(res);
669     return false;
670   }
671   return true;
672 }
673 
GetModuleFunction(GpuContext * context,CUmodule module,const char * kernel_name,CUfunction * function)674 /* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
675                                                CUmodule module,
676                                                const char* kernel_name,
677                                                CUfunction* function) {
678   ScopedActivateContext activated{context};
679   CHECK(module != nullptr && kernel_name != nullptr);
680   CUresult res = cuModuleGetFunction(function, module, kernel_name);
681   if (res != CUDA_SUCCESS) {
682     LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
683                << "\" from module: " << ToString(res);
684     return false;
685   }
686 
687   return true;
688 }
689 
GetModuleSymbol(GpuContext * context,CUmodule module,const char * symbol_name,CUdeviceptr * dptr,size_t * bytes)690 /* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
691                                              CUmodule module,
692                                              const char* symbol_name,
693                                              CUdeviceptr* dptr, size_t* bytes) {
694   ScopedActivateContext activated{context};
695   CHECK(module != nullptr && symbol_name != nullptr &&
696         (dptr != nullptr || bytes != nullptr));
697   CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
698   if (res != CUDA_SUCCESS) {
699     // symbol may not be found in the current module, but it may reside in
700     // another module.
701     VLOG(2) << "failed to get symbol \"" << symbol_name
702             << "\" from module: " << ToString(res);
703     return false;
704   }
705 
706   return true;
707 }
708 
UnloadModule(GpuContext * context,CUmodule module)709 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
710                                           CUmodule module) {
711   ScopedActivateContext activated{context};
712   CUresult res = cuModuleUnload(module);
713   if (res != CUDA_SUCCESS) {
714     LOG(ERROR) << "failed to unload module " << module
715                << "; leaking: " << ToString(res);
716   }
717 }
718 
DeviceFromContext(GpuContext * context)719 /* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
720     GpuContext* context) {
721   ScopedActivateContext activated{context};
722   CUdevice device = -1;
723   CUresult result = cuCtxGetDevice(&device);
724   if (result == CUDA_SUCCESS) {
725     return device;
726   }
727 
728   return port::Status(
729       port::error::INTERNAL,
730       absl::StrCat("failed to get device for context: ", ToString(result)));
731 }
732 
// Creates a CUDA stream in |context| and stores it in |*stream|. A |priority|
// of 0 uses the legacy cuStreamCreate API (default priority); any other value
// is forwarded to cuStreamCreateWithPriority. Returns false (after logging)
// on failure.
/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream,
                                          int priority) {
  // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
  // up synchronization with respect to memsets and any other things that have
  // to occur on the default stream?
  ScopedActivateContext activated{context};
  CUresult res;
  // If the priority is 0, then use the previous api to create the stream with
  // the default priority for backward compatibility. Probably there is no
  // difference in using the new api call but leaving it as is for now.
  if (priority == 0) {
    res = cuStreamCreate(stream, 0);
  } else {
    res = cuStreamCreateWithPriority(stream, 0, priority);
  }
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not allocate CUDA stream for context "
               << context->context() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for context "
          << context->context() << " on thread";
  return true;
}
758 
// Destroys |*stream| (no-op when already null) and resets it to nullptr on
// success. On failure the error is logged and the pointer is left untouched.
/* static */ void GpuDriver::DestroyStream(GpuContext* context,
                                           CUstream* stream) {
  if (*stream == nullptr) {
    return;
  }

  ScopedActivateContext activated{context};
  CUresult res = cuStreamDestroy(*stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to destroy CUDA stream for context "
               << context->context() << ": " << ToString(res);
  } else {
    VLOG(2) << "successfully destroyed stream " << *stream << " for context "
            << context->context();
    *stream = nullptr;
  }
}
776 
// Allocates |bytes| of device memory in |context| via cuMemAlloc. Returns
// nullptr when |bytes| is zero or when the allocation fails; failure is
// logged at INFO only because callers (e.g. BFCAllocator) retry.
/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
                                             uint64 bytes) {
  if (bytes == 0) {
    return nullptr;
  }

  ScopedActivateContext activated{context};
  CUdeviceptr result = 0;
  CUresult res = cuMemAlloc(&result, bytes);
  if (res != CUDA_SUCCESS) {
    // LOG(INFO) because this isn't always important to users (e.g. BFCAllocator
    // implements a retry if the first allocation fails).
    LOG(INFO) << "failed to allocate "
              << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
              << " bytes) from device: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context->context()
          << " of " << bytes << " bytes";
  return ptr;
}
799 
DeviceDeallocate(GpuContext * context,void * location)800 /* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
801                                               void* location) {
802   ScopedActivateContext activation(context);
803   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
804   CUresult res = cuMemFree(pointer);
805   if (res != CUDA_SUCCESS) {
806     LOG(ERROR) << "failed to free device memory at " << location
807                << "; result: " << ToString(res);
808   } else {
809     VLOG(2) << "deallocated " << location << " for context "
810             << context->context();
811   }
812 }
813 
// Allocates |bytes| of CUDA managed (unified) memory via cuMemAllocManaged
// with CU_MEM_ATTACH_GLOBAL, making it accessible from any stream/context.
// Returns nullptr on failure (after logging).
/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
                                                    uint64 bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr result = 0;
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes unified memory; result: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context->context()
          << " of " << bytes << " bytes in unified memory";
  return ptr;
}
830 
UnifiedMemoryDeallocate(GpuContext * context,void * location)831 /* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
832                                                      void* location) {
833   ScopedActivateContext activation(context);
834   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
835   CUresult res = cuMemFree(pointer);
836   if (res != CUDA_SUCCESS) {
837     LOG(ERROR) << "failed to free unified memory at " << location
838                << "; result: " << ToString(res);
839   } else {
840     VLOG(2) << "deallocated unified memory at " << location << " for context "
841             << context->context();
842   }
843 }
844 
// Allocates |bytes| of page-locked host memory visible to all CUDA contexts
// (CU_MEMHOSTALLOC_PORTABLE). On failure the error is logged and host_mem is
// returned as-is (initialized to nullptr; assumed untouched by a failed
// cuMemHostAlloc — callers should null-check).
/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation(context);
  void* host_mem = nullptr;
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes on host: " << ToString(res);
  }
  return host_mem;
}
856 
HostDeallocate(GpuContext * context,void * location)857 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
858                                             void* location) {
859   ScopedActivateContext activation(context);
860   CUresult res = cuMemFreeHost(location);
861   if (res != CUDA_SUCCESS) {
862     LOG(ERROR) << "error deallocating host memory at " << location << ": "
863                << ToString(res);
864   }
865 }
866 
HostRegister(GpuContext * context,void * location,uint64 bytes)867 /* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
868                                           uint64 bytes) {
869   ScopedActivateContext activation(context);
870   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
871   CUresult res =
872       cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
873   if (res != CUDA_SUCCESS) {
874     LOG(ERROR) << "error registering host memory at " << location << ": "
875                << ToString(res);
876     return false;
877   }
878   return true;
879 }
880 
HostUnregister(GpuContext * context,void * location)881 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
882                                             void* location) {
883   ScopedActivateContext activation(context);
884   CUresult res = cuMemHostUnregister(location);
885   if (res != CUDA_SUCCESS) {
886     LOG(ERROR) << "error unregistering host memory at " << location << ": "
887                << ToString(res);
888     return false;
889   }
890   return true;
891 }
892 
893 #if CUDA_VERSION >= 10020
// Reserves a |bytes|-sized range of device virtual address space without
// backing it with physical memory (cuMemAddressReserve). The returned span
// must eventually be released with FreeVirtualMemory.
/* static */ port::StatusOr<GpuDriver::VmemSpan>
GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr base;
  CUresult res = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
                                     /*addr=*/0, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
                        bytes, ToString(res)));
  }
  return {{base, bytes}};
}
907 
FreeVirtualMemory(GpuContext * context,GpuDriver::VmemSpan reservation)908 /* static */ void GpuDriver::FreeVirtualMemory(
909     GpuContext* context, GpuDriver::VmemSpan reservation) {
910   ScopedActivateContext activation(context);
911   CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
912   if (res != CUDA_SUCCESS) {
913     LOG(ERROR) << "error freeing vmem reservation of size "
914                << reservation.size_bytes << " at address " << reservation.base;
915   }
916 }
917 
// Returns the minimum allocation granularity (in bytes) for pinned device
// allocations on |device|, as reported by cuMemGetAllocationGranularity.
/* static */ port::StatusOr<uint64> GpuDriver::GetMinAllocationGranularity(
    GpuDeviceHandle device) {
  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device;

  size_t granularity;
  CUresult res = cuMemGetAllocationGranularity(
      &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrCat(
        "failed to get min allocation granularity: ", ToString(res)));
  }
  return granularity;
}
934 
935 /* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
CreateMemoryHandle(GpuContext * context,uint64 bytes)936 GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
937   ScopedActivateContext activation(context);
938   auto device = DeviceFromContext(context);
939   if (!device.ok()) {
940     LOG(ERROR) << "Failed to get device from context" << device.status();
941     return device.status();
942   }
943 
944   CUmemAllocationProp props = {};
945   props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
946   props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
947   props.location.id = device.ValueOrDie();
948 
949   CUmemGenericAllocationHandle mem_handle;
950   CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
951   if (res != CUDA_SUCCESS) {
952     return port::InternalError(
953         absl::StrFormat("failed to create memory allocation of size %d: %s",
954                         bytes, ToString(res)));
955   }
956   return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
957 }
958 
ReleaseMemoryHandle(GpuContext * context,GpuDriver::GenericMemoryHandle handle)959 /* static */ void GpuDriver::ReleaseMemoryHandle(
960     GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
961   ScopedActivateContext activation(context);
962 
963   CUresult res = cuMemRelease(handle.handle);
964   if (res != CUDA_SUCCESS) {
965     LOG(ERROR) << "Failed to release memory handle " << handle.handle
966                << " of size " << handle.bytes << ": " << ToString(res);
967   }
968 }
969 
MapMemory(GpuContext * context,CUdeviceptr va,const GpuDriver::GenericMemoryHandle & handle,const std::vector<GpuDeviceHandle> & device_handles)970 /* static */ port::Status GpuDriver::MapMemory(
971     GpuContext* context, CUdeviceptr va,
972     const GpuDriver::GenericMemoryHandle& handle,
973     const std::vector<GpuDeviceHandle>& device_handles) {
974   ScopedActivateContext activation(context);
975 
976   auto device = DeviceFromContext(context);
977   if (!device.ok()) {
978     return device.status();
979   }
980 
981   // NB: Zero is the only valid value for both flags and offset.
982   CUresult res =
983       cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
984   if (res != CUDA_SUCCESS) {
985     return port::InternalError(absl::StrFormat(
986         "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
987   }
988 
989   std::vector<CUmemAccessDesc> access_descriptors(device_handles.size());
990   for (int i = 0; i < access_descriptors.size(); ++i) {
991     access_descriptors[i].location.id = device_handles[i];
992     access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
993     access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
994   }
995 
996   res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
997                        access_descriptors.size());
998   if (res != CUDA_SUCCESS) {
999     // Unmap the memory that we failed to set access for.
1000     if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
1001       LOG(ERROR)
1002           << "Failed to unmap memory in GpuDriver::MapMemory error path.";
1003     }
1004     return port::InternalError(absl::StrFormat(
1005         "Failed to set read/write access on memory mapped at %d: %s", va,
1006         ToString(res)));
1007   }
1008   return port::Status::OK();
1009 }
1010 
UnmapMemory(GpuContext * context,CUdeviceptr va,uint64 bytes)1011 /* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
1012                                          uint64 bytes) {
1013   ScopedActivateContext activation(context);
1014 
1015   CUresult res = cuMemUnmap(va, bytes);
1016   if (res != CUDA_SUCCESS) {
1017     LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
1018                << ": " << ToString(res);
1019   }
1020 }
1021 
1022 #endif
1023 
// Destroys |*event|. Returns INVALID_ARGUMENT when the event is null; the
// pointed-to handle is not reset here (callers own that).
/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  CUevent* event) {
  if (*event == nullptr) {
    return port::Status(port::error::INVALID_ARGUMENT,
                        "input event cannot be null");
  }

  ScopedActivateContext activated{context};
  RETURN_IF_CUDA_RES_ERROR(cuEventDestroy(*event),
                           "Error destroying CUDA event");
  return port::Status::OK();
}
1036 
// Enqueues |event| on |stream| via cuEventRecord; the event completes once
// all preceding work on the stream completes.
/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
                                                 CUevent event,
                                                 CUstream stream) {
  ScopedActivateContext activated{context};
  RETURN_IF_CUDA_RES_ERROR(cuEventRecord(event, stream),
                           "Error recording CUDA event");
  return port::Status::OK();
}
1045 
// Polls |event| and returns the raw CUresult: CUDA_SUCCESS if the event has
// completed, CUDA_ERROR_NOT_READY if it is still pending. Any other driver
// result is converted to an INTERNAL error status.
/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
                                                            CUevent event) {
  ScopedActivateContext activated{context};
  CUresult res = cuEventQuery(event);
  if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("failed to query event: %s", ToString(res)));
  }

  return res;
}
1058 
// Computes the elapsed milliseconds between the |start| and |stop| events and
// writes it through |elapsed_milliseconds|. Blocks on cuEventSynchronize(stop)
// first, since cuEventElapsedTime requires the stop event to have completed.
// Returns false (after logging) on any driver failure.
/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
                                                 float* elapsed_milliseconds,
                                                 CUevent start, CUevent stop) {
  ScopedActivateContext activated{context};
  // The stop event must have completed in order for cuEventElapsedTime to
  // work.
  CUresult res = cuEventSynchronize(stop);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
    return false;
  }
  res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get elapsed time between events: "
               << ToString(res);
    return false;
  }

  return true;
}
1079 
WaitStreamOnEvent(GpuContext * context,CUstream stream,CUevent event)1080 /* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
1081                                                CUstream stream, CUevent event) {
1082   ScopedActivateContext activation(context);
1083   CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
1084   if (res != CUDA_SUCCESS) {
1085     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
1086     return false;
1087   }
1088 
1089   return true;
1090 }
1091 
SynchronizeContext(GpuContext * context)1092 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
1093   ScopedActivateContext activation(context);
1094   CUresult res = cuCtxSynchronize();
1095   if (res != CUDA_SUCCESS) {
1096     LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
1097                << " :: " << port::CurrentStackTrace();
1098     return false;
1099   }
1100 
1101   return true;
1102 }
1103 
// Blocks until all outstanding work on |stream| completes
// (cuStreamSynchronize). |stream| must be non-null.
/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       CUstream stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_CUDA_RES_ERROR(cuStreamSynchronize(stream),
                           "Could not synchronize CUDA stream");
  return port::Status::OK();
}
1112 
IsStreamIdle(GpuContext * context,CUstream stream)1113 /* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
1114                                           CUstream stream) {
1115   ScopedActivateContext activated{context};
1116   CHECK(stream != nullptr);
1117   CUresult res = cuStreamQuery(stream);
1118   if (res == CUDA_SUCCESS) {
1119     return true;
1120   }
1121 
1122   if (res != CUDA_ERROR_NOT_READY) {
1123     LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
1124   }
1125   return false;
1126 }
1127 
// Copies |size| bytes from device address |gpu_src| to |host_dst|, blocking
// until the copy completes (cuMemcpyDtoH).
/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
                                                          void* host_dst,
                                                          CUdeviceptr gpu_src,
                                                          uint64 size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronous memcpy from device to host "
                      "host dst: %p; GPU src: %p; size: %u=0x%x",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return port::Status::OK();
}
1142 
// Copies |size| bytes from |host_src| to device address |gpu_dst|, blocking
// until the copy completes (cuMemcpyHtoD).
/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
                                                          CUdeviceptr gpu_dst,
                                                          const void* host_src,
                                                          uint64 size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyHtoD(gpu_dst, host_src, size),
      absl::StrFormat(
          "failed to synchronous memcpy from host to device: GPU dst: %p;"
          " host src: %p; size: %u=0x%x",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return port::Status::OK();
}
1157 
SynchronousMemcpyD2D(GpuContext * context,CUdeviceptr gpu_dst,CUdeviceptr gpu_src,uint64 size)1158 /* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
1159                                                           CUdeviceptr gpu_dst,
1160                                                           CUdeviceptr gpu_src,
1161                                                           uint64 size) {
1162   ScopedActivateContext activation(context);
1163   RETURN_IF_CUDA_RES_ERROR(
1164       cuMemcpyDtoD(gpu_dst, gpu_src, size),
1165       absl::StrFormat(
1166           "failed to synchronous memcpy from host to device: GPU dst: %p; "
1167           "GPU src: %p; size: %u=0x%x",
1168           absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
1169           size));
1170   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
1171   return port::Status::OK();
1172 }
1173 
// Enqueues an asynchronous device-to-host copy of |size| bytes on |stream|
// (cuMemcpyDtoHAsync). Returns false (after logging) if enqueueing fails;
// a true return only means the copy was enqueued, not that it completed.
/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
                                                   void* host_dst,
                                                   CUdeviceptr gpu_src,
                                                   uint64 size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
        "GPU src: %p; size: %u=0x%x",
        ToString(res), host_dst, absl::bit_cast<void*>(gpu_src), size, size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
          << host_dst << " on stream " << stream;
  return true;
}
1193 
// Enqueues an asynchronous host-to-device copy of |size| bytes on |stream|
// (cuMemcpyHtoDAsync). Returns false (after logging) if enqueueing fails;
// a true return only means the copy was enqueued, not that it completed.
/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
                                                   CUdeviceptr gpu_dst,
                                                   const void* host_src,
                                                   uint64 size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
        "host src: %p; size: %u=0x%x",
        ToString(res), absl::bit_cast<void*>(gpu_dst), host_src, size, size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
          << " on stream " << stream;
  return true;
}
1212 
// Enqueues an asynchronous device-to-device copy of |size| bytes on |stream|
// (cuMemcpyDtoDAsync). On failure, logs a detailed diagnostic including the
// memory space, owning device, and peer-access status of both pointers.
// Returns false on failure; a true return only means the copy was enqueued.
/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
                                                   CUdeviceptr gpu_dst,
                                                   CUdeviceptr gpu_src,
                                                   uint64 size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
  if (result != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to device: %s"
        "; GPU dst: %p on %s %s"
        "; GPU src: %p on %s %s"
        "; can access? %s; size: %u=0x%x",
        ToString(result), absl::bit_cast<void*>(gpu_dst),
        CUDAPointerToMemorySpaceString(gpu_dst),
        CUDAPointerToDeviceString(gpu_dst), absl::bit_cast<void*>(gpu_src),
        CUDAPointerToMemorySpaceString(gpu_src),
        CUDAPointerToDeviceString(gpu_src),
        CUDAPointersToCanAccessString(gpu_src, gpu_dst), size, size);

    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
  return true;
}
1238 
InitEvent(GpuContext * context,CUevent * result,EventFlags flags)1239 /* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
1240                                                CUevent* result,
1241                                                EventFlags flags) {
1242   int cuflags;
1243   switch (flags) {
1244     case EventFlags::kDefault:
1245       cuflags = CU_EVENT_DEFAULT;
1246       break;
1247     case EventFlags::kDisableTiming:
1248       cuflags = CU_EVENT_DISABLE_TIMING;
1249       break;
1250     default:
1251       LOG(FATAL) << "impossible event flags: " << int(flags);
1252   }
1253 
1254   ScopedActivateContext activated{context};
1255   CUresult res = cuEventCreate(result, cuflags);
1256 
1257   if (res == CUDA_SUCCESS) {
1258     return port::Status::OK();
1259   } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
1260     return port::Status(port::error::RESOURCE_EXHAUSTED,
1261                         "could not create CUDA event: out of device memory");
1262   } else {
1263     return port::Status(
1264         port::error::FAILED_PRECONDITION,
1265         absl::StrCat("could not create CUDA event: ", ToString(res)));
1266   }
1267 }
1268 
GetDeviceCount()1269 /* static */ int GpuDriver::GetDeviceCount() {
1270   int device_count = 0;
1271   CUresult res = cuDeviceGetCount(&device_count);
1272   if (res != CUDA_SUCCESS) {
1273     LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
1274     return 0;
1275   }
1276 
1277   if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
1278     device_count = 1;
1279   }
1280   return device_count;
1281 }
1282 
// Returns the GpuContext that owns the device allocation containing
// |pointer|, via CU_POINTER_ATTRIBUTE_CONTEXT.
/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
    CUdeviceptr pointer) {
  GpuContext* context = nullptr;
  CUresult result =
      cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
  if (result == CUDA_SUCCESS) {
    CHECK(context != nullptr) << "success should entail non-null context";
    return context;
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to query device pointer for context: ",
                   ToString(result)));
}
1298 
GetPointerMemorySpace(CUdeviceptr pointer)1299 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1300     CUdeviceptr pointer) {
1301   unsigned int value;
1302   CUresult result =
1303       cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
1304   if (result == CUDA_SUCCESS) {
1305     switch (value) {
1306       case CU_MEMORYTYPE_DEVICE:
1307         return MemorySpace::kDevice;
1308       case CU_MEMORYTYPE_HOST:
1309         return MemorySpace::kHost;
1310       default:
1311         return port::Status(
1312             port::error::INTERNAL,
1313             absl::StrCat("unknown memory space provided by CUDA API: ", value));
1314     }
1315   }
1316 
1317   return port::Status(
1318       port::error::INTERNAL,
1319       absl::StrCat("failed to query device pointer for memory space: ",
1320                    ToString(result)));
1321 }
1322 
GetPointerAddressRange(CUdeviceptr dptr,CUdeviceptr * base,size_t * size)1323 /* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
1324                                                             CUdeviceptr* base,
1325                                                             size_t* size) {
1326   CUresult result = cuMemGetAddressRange(base, size, dptr);
1327   if (result == CUDA_SUCCESS) {
1328     return port::Status::OK();
1329   } else if (result == CUDA_ERROR_NOT_FOUND) {
1330     // We differentiate between "this pointer is unknown" (return here) and
1331     // "there was an internal error while performing this operation" (return
1332     // below).
1333     return port::Status(
1334         port::error::NOT_FOUND,
1335         absl::StrFormat("not a device pointer %p; %s",
1336                         reinterpret_cast<void*>(dptr), ToString(result)));
1337   }
1338 
1339   return port::Status(
1340       port::error::INTERNAL,
1341       absl::StrFormat("failed to get pointer into for device pointer %p; %s",
1342                       reinterpret_cast<void*>(dptr), ToString(result)));
1343 }
1344 
GetPointerDevice(CUdeviceptr pointer)1345 /* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
1346     CUdeviceptr pointer) {
1347   auto result = GetPointerContext(pointer);
1348   if (!result.ok()) {
1349     return result.status();
1350   }
1351 
1352   return DeviceFromContext(result.ValueOrDie());
1353 }
1354 
// Queries the compute capability of |device| and writes the major/minor
// components through |cc_major| and |cc_minor| (both zeroed first so they
// hold a defined value even on failure).
/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
                                                          int* cc_minor,
                                                          CUdevice device) {
  *cc_major = 0;
  *cc_minor = 0;

  CUresult res = cuDeviceGetAttribute(
      cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  if (res != CUDA_SUCCESS) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat(
            "failed to get compute capability major for device: %s; %d",
            ToString(res), device));
  }

  res = cuDeviceGetAttribute(
      cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  if (res != CUDA_SUCCESS) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat(
            "failed to get compute capability minor for device: %s; %d",
            ToString(res), device));
  }

  return port::Status::OK();
}
1383 
GetGpuISAVersion(int * version,CUdevice device)1384 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
1385                                                       CUdevice device) {
1386   return port::Status{
1387       port::error::INTERNAL,
1388       "Feature not supported on CUDA platform (GetGpuISAVersion)"};
1389 }
1390 
GetGpuGCNArchName(CUdevice,std::string *)1391 /* static */ port::Status GpuDriver::GetGpuGCNArchName(CUdevice, std::string*) {
1392   return port::Status{
1393       port::error::INTERNAL,
1394       "Feature not supported on CUDA platform (GetGpuGCNArchName)"};
1395 }
1396 
1397 // Helper function that turns the integer output of cuDeviceGetAttribute to type
1398 // T and wraps it in a StatusOr.
1399 template <typename T>
GetSimpleAttribute(CUdevice device,CUdevice_attribute attribute)1400 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
1401                                             CUdevice_attribute attribute) {
1402   int value = -1;
1403   RETURN_IF_CUDA_RES_ERROR(cuDeviceGetAttribute(&value, attribute, device),
1404                            "Could not retrieve CUDA device attribute (",
1405                            attribute);
1406   T converted = value;
1407   return converted;
1408 }
1409 
GetMultiprocessorCount(CUdevice device)1410 /* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
1411     CUdevice device) {
1412   return GetSimpleAttribute<int>(device,
1413                                  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
1414 }
1415 
GetMaxSharedMemoryPerCore(CUdevice device)1416 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
1417     CUdevice device) {
1418   return GetSimpleAttribute<int64>(
1419       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
1420 }
1421 
GetMaxSharedMemoryPerBlock(CUdevice device)1422 /* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
1423     CUdevice device) {
1424   return GetSimpleAttribute<int64>(
1425       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
1426 }
1427 
GetMaxThreadsPerMultiprocessor(CUdevice device)1428 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
1429     CUdevice device) {
1430   return GetSimpleAttribute<int64>(
1431       device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
1432 }
1433 
GetMaxThreadsPerBlock(CUdevice device)1434 /* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
1435     CUdevice device) {
1436   return GetSimpleAttribute<int64>(device,
1437                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
1438 }
1439 
GetMaxRegistersPerBlock(CUdevice device)1440 /* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
1441     CUdevice device) {
1442   return GetSimpleAttribute<int64>(device,
1443                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
1444 }
1445 
GetThreadsPerWarp(CUdevice device)1446 /* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
1447     CUdevice device) {
1448   return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
1449 }
1450 
GetGridLimits(int * x,int * y,int * z,CUdevice device)1451 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1452                                            CUdevice device) {
1453   int value;
1454   CUresult res =
1455       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
1456   if (res != CUDA_SUCCESS) {
1457     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1458     return false;
1459   }
1460   *x = value;
1461 
1462   res =
1463       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
1464   if (res != CUDA_SUCCESS) {
1465     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1466     return false;
1467   }
1468   *y = value;
1469 
1470   res =
1471       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
1472   if (res != CUDA_SUCCESS) {
1473     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1474     return false;
1475   }
1476   *z = value;
1477   return true;
1478 }
1479 
GetDriverVersion(int * driver_version)1480 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1481   CUresult res = cuDriverGetVersion(driver_version);
1482   if (res != CUDA_SUCCESS) {
1483     LOG(ERROR) << "failed to query driver version: " << ToString(res);
1484     return false;
1485   }
1486 
1487   return true;
1488 }
1489 
GetDeviceProperties(CUdevprop * device_properties,int device_ordinal)1490 /* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
1491                                                  int device_ordinal) {
1492   CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
1493   if (res != CUDA_SUCCESS) {
1494     LOG(ERROR) << "failed to query device properties: " << ToString(res);
1495     return false;
1496   }
1497 
1498   return true;
1499 }
1500 
GetDeviceAttribute(CUdevice_attribute attribute,CUdevice device)1501 /* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
1502     CUdevice_attribute attribute, CUdevice device) {
1503   int val;
1504   CUresult res = cuDeviceGetAttribute(&val, attribute, device);
1505   if (res != CUDA_SUCCESS) {
1506     return port::Status(
1507         port::error::INTERNAL,
1508         absl::StrFormat("failed to get device attribute %d for device %d: %s",
1509                         attribute, device, ToString(res)));
1510   }
1511   return val;
1512 }
1513 
IsEccEnabled(CUdevice device,bool * result)1514 /* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
1515   int value = -1;
1516   CUresult res =
1517       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
1518   if (res != CUDA_SUCCESS) {
1519     LOG(ERROR) << "failed to query ECC status: " << ToString(res);
1520     return false;
1521   }
1522 
1523   *result = value;
1524   return true;
1525 }
1526 
GetDeviceMemoryInfo(GpuContext * context,int64 * free_out,int64 * total_out)1527 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1528                                                  int64* free_out,
1529                                                  int64* total_out) {
1530   ScopedActivateContext activation(context);
1531   size_t free = 0;
1532   size_t total = 0;
1533   CUresult res = cuMemGetInfo(&free, &total);
1534   if (res != CUDA_SUCCESS) {
1535     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1536     return false;
1537   }
1538 
1539   *free_out = free;
1540   *total_out = total;
1541   return true;
1542 }
1543 
GetDeviceTotalMemory(CUdevice device,uint64 * result)1544 /* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
1545                                                   uint64* result) {
1546   size_t value = -1;
1547   CUresult res = cuDeviceTotalMem(&value, device);
1548   if (res != CUDA_SUCCESS) {
1549     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1550     return false;
1551   }
1552 
1553   *result = value;
1554   return true;
1555 }
1556 
GetPCIBusID(CUdevice device)1557 /* static */ std::string GpuDriver::GetPCIBusID(CUdevice device) {
1558   std::string pci_bus_id;
1559   static const int kBufferSize = 64;
1560   absl::InlinedVector<char, 4> chars(kBufferSize);
1561   chars[kBufferSize - 1] = '\0';
1562   CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
1563   if (res != CUDA_SUCCESS) {
1564     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1565     return pci_bus_id;
1566   }
1567   pci_bus_id = chars.begin();
1568   return pci_bus_id;
1569 }
1570 
// Returns true iff memory owned by `to`'s context could be made accessible
// from `from`'s context. Both contexts are resolved to device handles and the
// device-pair capability query below is used; resolution failures are logged
// and reported as "cannot enable".
/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
                                                 GpuContext* to) {
  if (from == to) {
    return true;  // A context can always access its own memory.
  }

  auto from_device = DeviceFromContext(from);
  if (!from_device.ok()) {
    LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
               << from_device.status();
    return false;
  }
  auto to_device = DeviceFromContext(to);
  if (!to_device.ok()) {
    LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
               << to_device.status();
    return false;
  }
  // Delegate to the CUdevice overload.
  return CanEnablePeerAccess(from_device.ValueOrDie(), to_device.ValueOrDie());
}
1591 
CanEnablePeerAccess(GpuDeviceHandle from,GpuDeviceHandle to)1592 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuDeviceHandle from,
1593                                                  GpuDeviceHandle to) {
1594   int can_access_peer = -1;
1595   CUresult result = cuDeviceCanAccessPeer(&can_access_peer, from, to);
1596   if (result != CUDA_SUCCESS) {
1597     LOG(ERROR) << "failed to detect peer access capability: "
1598                << ToString(result);
1599     return false;
1600   }
1601   return can_access_peer;
1602 }
1603 
// Enables one-directional peer access so that allocations in `to`'s context
// are accessible from `from`'s context. Returns OK when access is (or was
// already) enabled; otherwise an INTERNAL status with the driver error.
/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
                                                      GpuContext* to) {
  if (from == to) {
    return port::Status::OK();  // A context can always access its own memory.
  }

  ScopedActivateContext activated{from};
  CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
  // ALREADY_ENABLED is treated as success: the desired state holds.
  if (result != CUDA_SUCCESS &&
      result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("failed to enable peer access from %p to %p: %s", from,
                        to, ToString(result)));
  }

  return port::Status::OK();
}
1622 
// Returns the maximum number of blocks of `kernel` that can be simultaneously
// resident on one multiprocessor, for the given block size and per-block
// dynamic shared memory, as reported by the driver occupancy calculator.
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, CUfunction kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation(context);

  int max_blocks;
  RETURN_IF_CUDA_RES_ERROR(
      cuOccupancyMaxActiveBlocksPerMultiprocessor(
          &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes),
      absl::StrFormat("Failed to calculate occupancy of kernel %p", kernel));
  return max_blocks;
}
1635 
1636 }  // namespace gpu
1637 
1638 namespace cuda {
1639 
CurrentContextOrDie()1640 CUcontext CurrentContextOrDie() {
1641   CUcontext current = nullptr;
1642   FAIL_IF_CUDA_RES_ERROR(cuCtxGetCurrent(&current),
1643                          "Failed to query current context");
1644   return current;
1645 }
1646 
1647 }  // namespace cuda
1648 }  // namespace stream_executor
1649