1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // CUDA userspace driver library wrapper functionality.
17 
18 #ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
19 #define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
20 
21 #include <stddef.h>
22 #include "tensorflow/stream_executor/platform/port.h"
23 
24 #include "cuda/include/cuda.h"
25 #include "tensorflow/stream_executor/device_options.h"
26 #include "tensorflow/stream_executor/lib/status.h"
27 #include "tensorflow/stream_executor/lib/statusor.h"
28 #include "tensorflow/stream_executor/platform/port.h"
29 
30 #include "tensorflow/stream_executor/gpu/gpu_types.h"
31 
32 namespace stream_executor {
33 namespace gpu {
34 
35 // Identifies the memory space (host or device) where an allocation resides.
36 // Returned by GpuDriver::GetPointerMemorySpace().
37 enum class MemorySpace { kHost, kDevice };
38 
39 // Returns a short, informal name (such as "host") for the given memory space.
40 string MemorySpaceString(MemorySpace memory_space);
41 
42 class GpuContext;  // Forward declaration; only used via pointer in this header.
43 
44 // GpuDriver contains wrappers for calls to the userspace library driver. It's
45 // useful to isolate these calls and put basic wrappers around them to separate
46 // userspace library driver behaviors from the rest of the program.
47 //
48 // At the moment it's simply used as a namespace (all functions are static).
49 //
50 // The calls log any specific errors internally and generally report success or
51 // failure to the caller via a bool, port::Status or port::StatusOr result.
52 //
53 // The order of parameters is generally kept symmetric with the underlying CUDA
54 // driver API.
55 //
56 // Links on functions are to specific documentation under
57 // http://docs.nvidia.com/cuda/cuda-driver-api/
58 //
59 // Thread safety: these functions should not be used from signal handlers.
60 class GpuDriver {
61  public:
62   // Wraps a call to cuInit with logging to help indicate what has gone wrong in
63   // the case of failure. Safe to call multiple times; will be fast on all calls
64   // after the first.
65   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
66   static port::Status Init();
67 
68   // Returns the device associated with the given context.
69   // The device handle is carried in the returned StatusOr (no outparam).
70   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
71   static port::StatusOr<GpuDeviceHandle> DeviceFromContext(GpuContext* context);
72 
73   // Creates a new CUDA stream associated with the given context via
74   // cuStreamCreate.
75   // stream is an outparam owned by the caller, must not be null.
76   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
77   static bool CreateStream(GpuContext* context, GpuStreamHandle* stream);
78 
79   // Destroys a CUDA stream associated with the given context.
80   // stream is owned by the caller, must not be null, and *stream is set to null
81   // if the stream is successfully destroyed.
82   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
83   static void DestroyStream(GpuContext* context, GpuStreamHandle* stream);
84 
85   // CUDA events can explicitly disable event TSC retrieval for some presumed
86   // performance improvement if timing is unnecessary.
87   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
88   enum class EventFlags { kDefault, kDisableTiming };
89 
90   // Creates a new event associated with the given context.
91   // result is an outparam owned by the caller and must not be null.
92   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
93   static port::Status CreateEvent(GpuContext* context, GpuEventHandle* result,
94                                   EventFlags flags);
95 
96   // Destroys *event via cuEventDestroy and turns it into a nullptr. event may
97   // not be null, but *event may be.
98   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
99   static port::Status DestroyEvent(GpuContext* context, GpuEventHandle* event);
100 
101   // Allocates a GPU memory space of size bytes associated with the given
102   // context via cuMemAlloc. NOTE(review): presumably nullptr on failure.
103   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
104   static void* DeviceAllocate(GpuContext* context, uint64 bytes);
105 
106   // Deallocates the GPU memory space at location, associated with the given
107   // context, via cuMemFree.
108   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
109   static void DeviceDeallocate(GpuContext* context, void* location);
110 
111   // Allocates a unified memory space of size bytes associated with the given
112   // context via cuMemAllocManaged.
113   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
114   // (supported on CUDA only)
115   static void* UnifiedMemoryAllocate(GpuContext* context, uint64 bytes);
116 
117   // Deallocates the unified memory space at location, associated with the
118   // given context, via cuMemFree.
119   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
120   // (supported on CUDA only)
121   static void UnifiedMemoryDeallocate(GpuContext* context, void* location);
122 
123   // Allocates page-locked and CUDA-registered memory on the host via
124   // cuMemAllocHost.
125   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
126   static void* HostAllocate(GpuContext* context, uint64 bytes);
127 
128   // Deallocates a location created by HostAllocate, via cuMemFreeHost.
129   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
130   static void HostDeallocate(GpuContext* context, void* location);
131 
132   // Registers a memory region at location of size bytes via cuMemHostRegister.
133   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
134   static bool HostRegister(GpuContext* context, void* location, uint64 bytes);
135 
136   // Unregisters a memory region that was previously registered at location via
137   // cuMemHostUnregister.
138   //
139   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
140   //
141   // TODO(leary) verify an error will be returned if the location wasn't
142   // previously registered.
143   static bool HostUnregister(GpuContext* context, void* location);
144 
145   // Given a device ordinal, returns a device handle into the device outparam,
146   // which must not be null.
147   //
148   // N.B. these device handles do not have a corresponding destroy function in
149   // the CUDA driver API.
150   static port::Status GetDevice(int device_ordinal, GpuDeviceHandle* device);
151 
152   // Given a device handle, returns the name reported by the driver for the
153   // device.
154   static bool GetDeviceName(GpuDeviceHandle device, string* device_name);
155 
156   // Given a device to create a context for, returns a context handle into the
157   // context outparam, which must not be null.
158   //
159   // N.B. CUDA contexts are weird. They are implicitly associated with the
160   // calling thread. Current documentation on contexts and their influence on
161   // userspace processes is given here:
162   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
163   static port::Status CreateContext(int device_ordinal, GpuDeviceHandle device,
164                                     const DeviceOptions& device_options,
165                                     GpuContext** context);
166 
167   // Destroys the provided context via cuCtxDestroy.
168   // Don't do this while clients could still be using the context; per the
169   // docs, bad things will happen.
170   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
171   static void DestroyContext(GpuContext* context);
172 
173   // Queries the runtime for the specified attribute of the specified function.
174   // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
175   // in terms of integer-sized values, so there's no potential for overrun (as
176   // of CUDA 5.5).
177   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
178   static bool FuncGetAttribute(GpuFunctionAttribute attribute,
179                                GpuFunctionHandle function,
180                                int* attribute_value);
181 
182   // Sets the preferred cache configuration for the specified function.
183   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
184   static bool FuncSetCacheConfig(GpuFunctionHandle function,
185                                  GpuFuncCachePreference cache_config);
186 
187   // Gets the preferred shared memory bank configuration for the specified
188   // CONTEXT (not function!), either default or four- or eight-byte bank size.
189   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
190   static port::StatusOr<GpuSharedMemConfig> ContextGetSharedMemConfig(
191       GpuContext* context);
192 
193   // Sets the preferred shared memory bank configuration for the specified
194   // CONTEXT (not function!), either default or four- or eight-byte bank size.
195   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
196   static port::Status ContextSetSharedMemConfig(
197       GpuContext* context, GpuSharedMemConfig shared_mem_config);
198 
199   // Launches a CUDA kernel via cuLaunchKernel.
200   // TODO(leary) describe the structure of kernel_params and extra in a readable
201   // way.
202   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
203   static bool LaunchKernel(GpuContext* context, GpuFunctionHandle function,
204                            unsigned int grid_dim_x, unsigned int grid_dim_y,
205                            unsigned int grid_dim_z, unsigned int block_dim_x,
206                            unsigned int block_dim_y, unsigned int block_dim_z,
207                            unsigned int shared_mem_bytes,
208                            GpuStreamHandle stream, void** kernel_params,
209                            void** extra);
210 
211   // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
212   // handle in "module". Any error logs that are produced are logged internally.
213   // (supported on CUDA only)
214   static bool LoadPtx(GpuContext* context, const char* ptx_contents,
215                       GpuModuleHandle* module);
216 
217   // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
218   // the resulting handle in "module".
219   // (supported on CUDA only)
220   static port::Status LoadCubin(GpuContext* context, const char* cubin_bytes,
221                                 GpuModuleHandle* module);
222 
223   // Loads HSACO with the ROCM runtime and stores the resulting handle in
224   // "module". Any error logs that are produced are logged internally.
225   // (supported on ROCm only)
226   static bool LoadHsaco(GpuContext* context, const char* hsaco_contents,
227                         GpuModuleHandle* module);
228 
229   // Retrieves a named kernel from a loaded module, and places the resulting
230   // handle into function (outparam) on success. Neither kernel_name nor
231   // function may be null. No ownership is taken of kernel_name.
232   static bool GetModuleFunction(GpuContext* context, GpuModuleHandle module,
233                                 const char* kernel_name,
234                                 GpuFunctionHandle* function);
235 
236   // Retrieves a named global/constant symbol from a loaded module, and returns
237   // a device pointer and size of the symbol on success. symbol_name may not be
238   // null. At least one of dptr or bytes should not be null. No ownership is
239   // taken of symbol_name.
240   static bool GetModuleSymbol(GpuContext* context, GpuModuleHandle module,
241                               const char* symbol_name, GpuDevicePtr* dptr,
242                               size_t* bytes);
243 
244   // Unloads module from the current context via cuModuleUnload.
245   // TODO(leary) the documentation doesn't say what kind of disasters happen
246   // if you try to unload a module while its GpuFunctionHandles are in use.
247   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
248   static void UnloadModule(GpuContext* context, GpuModuleHandle module);
249 
250   // Performs a synchronous memset of the device memory segment via cuMemsetD8.
251   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
252   static bool SynchronousMemsetUint8(GpuContext* context, GpuDevicePtr location,
253                                      uint8 value, size_t size);
254 
255   // Performs a synchronous memset of the device memory segment via cuMemsetD32.
256   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
257   static bool SynchronousMemsetUint32(GpuContext* context,
258                                       GpuDevicePtr location, uint32 value,
259                                       size_t uint32_count);
260 
261   // Performs an asynchronous memset of the device memory segment via
262   // cuMemsetD8Async. NOTE(review): uint32_count presumably counts uint8s.
263   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
264   static bool AsynchronousMemsetUint8(GpuContext* context,
265                                       GpuDevicePtr location, uint8 value,
266                                       size_t uint32_count,
267                                       GpuStreamHandle stream);
268 
269   // Performs an asynchronous memset of the device memory segment via
270   // cuMemsetD32Async.
271   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
272   static bool AsynchronousMemsetUint32(GpuContext* context,
273                                        GpuDevicePtr location, uint32 value,
274                                        size_t uint32_count,
275                                        GpuStreamHandle stream);
276 
277   // -- Synchronous memcopies (H = host, D = device; e.g. D2H copies to host).
278   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
279 
280   static port::Status SynchronousMemcpyD2H(GpuContext* context, void* host_dst,
281                                            GpuDevicePtr gpu_src, uint64 size);
282   static port::Status SynchronousMemcpyH2D(GpuContext* context,
283                                            GpuDevicePtr gpu_dst,
284                                            const void* host_src, uint64 size);
285   static port::Status SynchronousMemcpyD2D(GpuContext* context,
286                                            GpuDevicePtr gpu_dst,
287                                            GpuDevicePtr gpu_src, uint64 size);
288 
289   // -- Asynchronous memcopies, enqueued on the given stream.
290   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
291 
292   static bool AsynchronousMemcpyD2H(GpuContext* context, void* host_dst,
293                                     GpuDevicePtr gpu_src, uint64 size,
294                                     GpuStreamHandle stream);
295   static bool AsynchronousMemcpyH2D(GpuContext* context, GpuDevicePtr gpu_dst,
296                                     const void* host_src, uint64 size,
297                                     GpuStreamHandle stream);
298   static bool AsynchronousMemcpyD2D(GpuContext* context, GpuDevicePtr gpu_dst,
299                                     GpuDevicePtr gpu_src, uint64 size,
300                                     GpuStreamHandle stream);
301 
302   // The CUDA stream callback type signature.
303   // The data passed to AddStreamCallback is subsequently passed to this
304   // callback when it fires.
305   //
306   // Some notable things:
307   // * Callbacks must not make any CUDA API calls.
308   // * Callbacks from independent streams execute in an undefined order and may
309   //   be serialized.
310   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
311   typedef void (*StreamCallback)(GpuStreamHandle stream, GpuStatus status,
312                                  void* data);
313 
314   // Enqueues a callback operation into stream.
315   // See StreamCallback above and the NVIDIA documentation for additional
316   // details.
317   static bool AddStreamCallback(GpuContext* context, GpuStreamHandle stream,
318                                 StreamCallback callback, void* data);
319 
320   // Causes stream to wait for event to trigger before proceeding via
321   // cuStreamWaitEvent.
322   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
323   static bool WaitStreamOnEvent(GpuContext* context, GpuStreamHandle stream,
324                                 GpuEventHandle event);
325 
326   // Blocks the calling thread until the operations enqueued onto stream have
327   // been completed, via cuStreamSynchronize.
328   //
329   // TODO(leary) if a pathological thread enqueues operations onto the stream
330   // while another thread blocks like this, can you wind up waiting an unbounded
331   // amount of time?
332   //
333   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
334   static port::Status SynchronizeStream(GpuContext* context,
335                                         GpuStreamHandle stream);
336 
337   // Blocks the calling thread until the operations associated with the context
338   // have been completed, via cuCtxSynchronize.
339   //
340   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
341   static bool SynchronizeContext(GpuContext* context);
342 
343   // Returns true if all stream tasks have completed at time of the call. Note
344   // the potential for races around this call (if another thread adds work to
345   // the stream immediately after this returns).
346   static bool IsStreamIdle(GpuContext* context, GpuStreamHandle stream);
347 
348   // Returns whether code in the from context can access memory in the to
349   // context via cuDeviceCanAccessPeer.
350   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
351   static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);
352 
353   // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
354   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
355   static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);
356 
357   // Returns the elapsed milliseconds between start and stop via
358   // cuEventElapsedTime.
359   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
360   static bool GetEventElapsedTime(GpuContext* context,
361                                   float* elapsed_milliseconds,
362                                   GpuEventHandle start, GpuEventHandle stop);
363 
364   // Records that an event occurred when execution reaches the current point in
365   // the stream via cuEventRecord.
366   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
367   static port::Status RecordEvent(GpuContext* context, GpuEventHandle event,
368                                   GpuStreamHandle stream);
369 
370   // Polls (without blocking) to determine the status of an event - pending or
371   // complete (or an error status).
372   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
373   static port::StatusOr<GpuStatus> QueryEvent(GpuContext* context,
374                                               GpuEventHandle event);
375 
376   // -- Pointer-specific calls.
377 
378   // Returns the context in which pointer was allocated or registered.
379   static port::StatusOr<GpuContext*> GetPointerContext(GpuDevicePtr pointer);
380 
381   // Returns the device associated with the context from GetPointerContext().
382   static port::StatusOr<GpuDeviceHandle> GetPointerDevice(GpuDevicePtr pointer);
383 
384   // Returns the memory space addressed by pointer.
385   static port::StatusOr<MemorySpace> GetPointerMemorySpace(
386       GpuDevicePtr pointer);
387 
388   // Returns the base address and size of the device pointer dptr.
389   static port::Status GetPointerAddressRange(GpuDevicePtr dptr,
390                                              GpuDevicePtr* base, size_t* size);
391 
392   // -- Device-specific calls.
393 
394   // Returns the compute capability for the device; e.g. (3, 5).
395   // This is currently done via the deprecated device API.
396   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
397   // (supported on CUDA only)
398   static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
399                                            GpuDeviceHandle device);
400 
401   // Returns Gpu ISA version for the device; e.g. 803, 900.
402   // (supported on ROCm only)
403   static port::Status GetGpuISAVersion(int* version, GpuDeviceHandle device);
404 
405   // Returns the number of multiprocessors on the device (note that the device
406   // may be multi-GPU-per-board).
407   static port::StatusOr<int> GetMultiprocessorCount(GpuDeviceHandle device);
408 
409   // Returns the limit on number of threads that can be resident in a single
410   // multiprocessor.
411   static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(
412       GpuDeviceHandle device);
413 
414   // Returns the limit on number of threads which may be resident for a single
415   // block (cooperative thread array).
416   static port::StatusOr<int64> GetMaxThreadsPerBlock(GpuDeviceHandle device);
417 
418   // Returns the amount of shared memory available on a single GPU core (i.e.
419   // SM on NVIDIA devices).
420   static port::StatusOr<int64> GetMaxSharedMemoryPerCore(
421       GpuDeviceHandle device);
422 
423   // Returns the amount of shared memory available for a single block
424   // (cooperative thread array).
425   static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(
426       GpuDeviceHandle device);
427 
428   // Returns the maximum supported number of registers per block.
429   static port::StatusOr<int64> GetMaxRegistersPerBlock(GpuDeviceHandle device);
430 
431   // Returns the number of threads per warp.
432   static port::StatusOr<int64> GetThreadsPerWarp(GpuDeviceHandle device);
433 
434   // Queries the grid limits for device with cuDeviceGetAttribute calls.
435   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
436   static bool GetGridLimits(int* x, int* y, int* z, GpuDeviceHandle device);
437 
438   // Returns a grab-bag of device properties in a caller-owned device_properties
439   // structure for device_ordinal via cuDeviceGetProperties.
440   //
441   // This call is deprecated in the NVIDIA driver API; its replacement is
442   // GetDeviceAttribute.
443   //
444   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
445   static bool GetDeviceProperties(GpuDeviceProperty* device_properties,
446                                   int device_ordinal);
447 
448   // Gets a specific integer-valued property about the given device.
449   //
450   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
451   static port::StatusOr<int> GetDeviceAttribute(GpuDeviceAttribute attribute,
452                                                 GpuDeviceHandle device);
453 
454   // Returns whether ECC is enabled for the given GpuDeviceHandle via
455   // cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
456   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
457   static bool IsEccEnabled(GpuDeviceHandle device, bool* result);
458 
459   // Returns the total amount of memory available for allocation by the CUDA
460   // context, in bytes, via cuDeviceTotalMem.
461   static bool GetDeviceTotalMemory(GpuDeviceHandle device, uint64* result);
462 
463   // Returns the free amount of memory and total amount of memory, as reported
464   // by cuMemGetInfo.
465   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
466   static bool GetDeviceMemoryInfo(GpuContext* context, int64* free,
467                                   int64* total);
468 
469   // Returns a PCI bus id string for the device.
470   // [domain]:[bus]:[device].[function]
471   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
472   static string GetPCIBusID(GpuDeviceHandle device);
473 
474   // -- Context- and device-independent calls.
475 
476   // Returns the number of visible CUDA devices via cuDeviceGetCount.
477   // This should correspond to the set of device ordinals available.
478   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
479   static int GetDeviceCount();
480 
481   // Returns the driver version number via cuDriverGetVersion.
482   // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
483   // instead, the CUDA toolkit release number that this driver is compatible
484   // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
485   // compatible driver).
486   //
487   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
488   static bool GetDriverVersion(int* driver_version);
489 
490   // -- Other calls
491 
492   // Returns the maximum number of blocks (per multiprocessor) occupied by the
493   // specified kernel/GpuFunctionHandle when launched with the specified
494   // parameters.
495   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
496   static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
497       GpuContext* context, GpuFunctionHandle kernel, int threads_per_block,
498       size_t dynamic_shared_memory_bytes);
499 
500   // Seam for injecting an error at CUDA initialization time for testing
501   // purposes.
502   static bool driver_inject_init_error_;
503 };
504 
505 // Ensures a context is activated within a scope (constructor/destructor pair).
506 class ScopedActivateContext {
507  public:
508   // Activates the context via cuCtxSetCurrent, if it is not the currently
509   // active context (a la cuCtxGetCurrent). Note the alternative push/pop
510   // mechanism is said by NVIDIA to be relatively slow and deprecated.
511   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
512   explicit ScopedActivateContext(GpuContext* context);
513 
514   // Checks that the context has remained activated for the duration of the
515   // scope.
516   ~ScopedActivateContext();
517 
518  private:
519   GpuContext* to_restore_ = nullptr;  // NOTE(review): presumably the prior context - confirm.
520 };
521 
522 }  // namespace gpu
523 }  // namespace stream_executor
524 
525 #endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
526