1 /*
2  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
3  *
4  * Please refer to the NVIDIA end user license agreement (EULA) associated
5  * with this source code for terms and conditions that govern your use of
6  * this software. Any use, reproduction, disclosure, or distribution of
7  * this software and related documentation outside the terms of the EULA
8  * is strictly prohibited.
9  *
10  */
11 
12 #ifndef __cuda_cuda_h__
13 #define __cuda_cuda_h__
14 
15 #include <stdlib.h>
16 
17 #ifndef __CUDA_API_VERSION
18 #define __CUDA_API_VERSION 4000
19 #endif
20 
21 /**
22  * \defgroup CUDA_DRIVER CUDA Driver API
23  *
24  * This section describes the low-level CUDA driver application programming
25  * interface.
26  *
27  * @{
28  */
29 
30 /**
31  * \defgroup CUDA_TYPES Data types used by CUDA driver
32  * @{
33  */
34 
35 /**
36  * CUDA API version number
37  */
38 #define CUDA_VERSION 4000 /* 4.0 */
39 
40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43 
44 /**
45  * CUDA device pointer
46  */
47 #if __CUDA_API_VERSION >= 3020
48 
49 #if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__aarch64__)
50     typedef unsigned long long CUdeviceptr;
51 #else
52     typedef unsigned int CUdeviceptr;
53 #endif
54 
55 #endif /* __CUDA_API_VERSION >= 3020 */
56 
57 typedef int CUdevice;                                     /**< CUDA device */
58 typedef struct CUctx_st *CUcontext;                       /**< CUDA context */
59 typedef struct CUmod_st *CUmodule;                        /**< CUDA module */
60 typedef struct CUfunc_st *CUfunction;                     /**< CUDA function */
61 typedef struct CUarray_st *CUarray;                       /**< CUDA array */
62 typedef struct CUtexref_st *CUtexref;                     /**< CUDA texture reference */
63 typedef struct CUsurfref_st *CUsurfref;                   /**< CUDA surface reference */
64 typedef struct CUevent_st *CUevent;                       /**< CUDA event */
65 typedef struct CUstream_st *CUstream;                     /**< CUDA stream */
66 typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */
67 
68 typedef struct CUuuid_st                                  /**< CUDA definition of UUID */
69 {
70     char bytes[16];
71 } CUuuid;
72 
73 /**
74  * Context creation flags
75  */
76 typedef enum CUctx_flags_enum
77 {
78     CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
79     CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
80     CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
81     CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
82     CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */
83     CU_CTX_MAP_HOST            = 0x08, /**< Support mapped pinned allocations */
84     CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
85 #if __CUDA_API_VERSION < 4000
86     CU_CTX_SCHED_MASK          = 0x03,
87     CU_CTX_FLAGS_MASK          = 0x1f
88 #else
89     CU_CTX_SCHED_MASK          = 0x07,
90     CU_CTX_PRIMARY             = 0x20, /**< Initialize and return the primary context */
91     CU_CTX_FLAGS_MASK          = 0x3f
92 #endif
93 } CUctx_flags;
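
/*
 * Example (illustrative sketch, not part of the original API): creating a
 * context with explicit scheduling flags. Assumes ::cuInit has already
 * succeeded and that the driver entry point ::cuCtxCreate, matching
 * ::tcuCtxCreate declared below, is available.
 *
 * \code
 * CUdevice  dev;
 * CUcontext ctx;
 * cuDeviceGet(&dev, 0);
 * // Let the CPU thread block while waiting for GPU work and allow
 * // mapped pinned host allocations in this context.
 * if (cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, dev) != CUDA_SUCCESS) {
 *     // handle the failure
 * }
 * \endcode
 */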
94 
95 /**
96  * Event creation flags
97  */
98 typedef enum CUevent_flags_enum
99 {
100     CU_EVENT_DEFAULT        = 0, /**< Default event flag */
101     CU_EVENT_BLOCKING_SYNC  = 1, /**< Event uses blocking synchronization */
102     CU_EVENT_DISABLE_TIMING = 2  /**< Event will not record timing data */
103 } CUevent_flags;
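
/*
 * Example (illustrative sketch, not part of the original API): timing GPU work
 * with events. Assumes a context is current and the driver entry points
 * ::cuEventCreate, ::cuEventRecord, ::cuEventSynchronize,
 * ::cuEventElapsedTime and ::cuEventDestroy are available.
 *
 * \code
 * CUevent start, stop;
 * float   ms = 0.0f;
 * cuEventCreate(&start, CU_EVENT_DEFAULT);
 * cuEventCreate(&stop,  CU_EVENT_DEFAULT);
 * cuEventRecord(start, 0);              // record on the default stream
 * // ... enqueue kernels and/or memcpys here ...
 * cuEventRecord(stop, 0);
 * cuEventSynchronize(stop);             // wait for the stop event to complete
 * cuEventElapsedTime(&ms, start, stop); // elapsed time in milliseconds
 * cuEventDestroy(start);
 * cuEventDestroy(stop);
 * \endcode
 */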
104 
105 /**
106  * Array formats
107  */
108 typedef enum CUarray_format_enum
109 {
110     CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
111     CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
112     CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
113     CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< Signed 8-bit integers */
114     CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
115     CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */
116     CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
117     CU_AD_FORMAT_FLOAT          = 0x20  /**< 32-bit floating point */
118 } CUarray_format;
119 
120 /**
121  * Texture reference addressing modes
122  */
123 typedef enum CUaddress_mode_enum
124 {
125     CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
126     CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
127     CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
128     CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
129 } CUaddress_mode;
130 
131 /**
132  * Texture reference filtering modes
133  */
134 typedef enum CUfilter_mode_enum
135 {
136     CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
137     CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
138 } CUfilter_mode;
139 
140 /**
141  * Device properties
142  */
143 typedef enum CUdevice_attribute_enum
144 {
145     CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,              /**< Maximum number of threads per block */
146     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                    /**< Maximum block dimension X */
147     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                    /**< Maximum block dimension Y */
148     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                    /**< Maximum block dimension Z */
149     CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                     /**< Maximum grid dimension X */
150     CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                     /**< Maximum grid dimension Y */
151     CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                     /**< Maximum grid dimension Z */
152     CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,        /**< Maximum shared memory available per block in bytes */
153     CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,            /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
154     CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,              /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
155     CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                         /**< Warp size in threads */
156     CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                         /**< Maximum pitch in bytes allowed by memory copies */
157     CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,           /**< Maximum number of 32-bit registers available per block */
158     CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,               /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
159     CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                        /**< Peak clock frequency in kilohertz */
160     CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                 /**< Alignment requirement for textures */
161     CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                       /**< Device can possibly copy memory and execute a kernel concurrently */
162     CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,              /**< Number of multiprocessors on device */
163     CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,               /**< Specifies whether there is a run time limit on kernels */
164     CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                        /**< Device is integrated with host memory */
165     CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,               /**< Device can map host memory into CUDA address space */
166     CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                      /**< Compute mode (See ::CUcomputemode for details) */
167     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,           /**< Maximum 1D texture width */
168     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,           /**< Maximum 2D texture width */
169     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,          /**< Maximum 2D texture height */
170     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,           /**< Maximum 3D texture width */
171     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,          /**< Maximum 3D texture height */
172     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,           /**< Maximum 3D texture depth */
173     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,     /**< Maximum texture array width */
174     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,    /**< Maximum texture array height */
175     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Maximum slices in a texture array */
176     CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                 /**< Alignment requirement for surfaces */
177     CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                /**< Device can possibly execute multiple kernels concurrently */
178     CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                       /**< Device has ECC support enabled */
179     CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                        /**< PCI bus ID of the device */
180     CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                     /**< PCI device ID of the device */
181     CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35                         /**< Device is using TCC driver model */
182 #if __CUDA_API_VERSION >= 4000
183   , CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                 /**< Peak memory clock frequency in kilohertz */
184     CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,           /**< Global memory bus width in bits */
185     CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                     /**< Size of L2 cache in bytes */
186     CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,    /**< Maximum resident threads per multiprocessor */
187     CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                /**< Number of asynchronous engines */
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                /**< Device shares a unified address space with the host */
189     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,   /**< Maximum 1D layered texture width */
190     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43   /**< Maximum layers in a 1D layered texture */
191 #endif
192 } CUdevice_attribute;
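
/*
 * Example (illustrative sketch, not part of the original API): querying device
 * attributes. Assumes ::cuInit has succeeded and the driver entry points
 * ::cuDeviceGet and ::cuDeviceGetAttribute are available.
 *
 * \code
 * CUdevice dev;
 * int smCount = 0, maxThreads = 0;
 * cuDeviceGet(&dev, 0);
 * cuDeviceGetAttribute(&smCount,    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,  dev);
 * cuDeviceGetAttribute(&maxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
 * \endcode
 */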
193 
194 /**
195  * Legacy device properties
196  */
197 typedef struct CUdevprop_st
198 {
199     int maxThreadsPerBlock;     /**< Maximum number of threads per block */
200     int maxThreadsDim[3];       /**< Maximum size of each dimension of a block */
201     int maxGridSize[3];         /**< Maximum size of each dimension of a grid */
202     int sharedMemPerBlock;      /**< Shared memory available per block in bytes */
203     int totalConstantMemory;    /**< Constant memory available on device in bytes */
204     int SIMDWidth;              /**< Warp size in threads */
205     int memPitch;               /**< Maximum pitch in bytes allowed by memory copies */
206     int regsPerBlock;           /**< 32-bit registers available per block */
207     int clockRate;              /**< Clock frequency in kilohertz */
208     int textureAlign;           /**< Alignment requirement for textures */
209 } CUdevprop;
210 
211 /**
212  * Function properties
213  */
214 typedef enum CUfunction_attribute_enum
215 {
216     /**
217      * The maximum number of threads per block, beyond which a launch of the
218      * function would fail. This number depends on both the function and the
219      * device on which the function is currently loaded.
220      */
221     CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
222 
223     /**
224      * The size in bytes of statically-allocated shared memory required by
225      * this function. This does not include dynamically-allocated shared
226      * memory requested by the user at runtime.
227      */
228     CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
229 
230     /**
231      * The size in bytes of user-allocated constant memory required by this
232      * function.
233      */
234     CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
235 
236     /**
237      * The size in bytes of local memory used by each thread of this function.
238      */
239     CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
240 
241     /**
242      * The number of registers used by each thread of this function.
243      */
244     CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
245 
246     /**
247      * The PTX virtual architecture version for which the function was
248      * compiled. This value is the major PTX version * 10 + the minor PTX
249      * version, so a PTX version 1.3 function would return the value 13.
250      * Note that this may return the undefined value of 0 for cubins
251      * compiled prior to CUDA 3.0.
252      */
253     CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
254 
255     /**
256      * The binary architecture version for which the function was compiled.
257      * This value is the major binary version * 10 + the minor binary version,
258      * so a binary version 1.3 function would return the value 13. Note that
259      * this will return a value of 10 for legacy cubins that do not have a
260      * properly-encoded binary architecture version.
261      */
262     CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
263 
264     CU_FUNC_ATTRIBUTE_MAX
265 } CUfunction_attribute;
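
/*
 * Example (illustrative sketch, not part of the original API): inspecting a
 * kernel's resource usage before choosing a launch configuration. Assumes the
 * driver entry point ::cuFuncGetAttribute is available and that \p kernel is a
 * ::CUfunction previously obtained with ::cuModuleGetFunction.
 *
 * \code
 * int maxThreads = 0, numRegs = 0, staticShared = 0;
 * cuFuncGetAttribute(&maxThreads,   CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
 * cuFuncGetAttribute(&numRegs,      CU_FUNC_ATTRIBUTE_NUM_REGS,              kernel);
 * cuFuncGetAttribute(&staticShared, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,     kernel);
 * \endcode
 */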
266 
267 /**
268  * Function cache configurations
269  */
270 typedef enum CUfunc_cache_enum
271 {
272     CU_FUNC_CACHE_PREFER_NONE    = 0x00, /**< no preference for shared memory or L1 (default) */
273     CU_FUNC_CACHE_PREFER_SHARED  = 0x01, /**< prefer larger shared memory and smaller L1 cache */
274     CU_FUNC_CACHE_PREFER_L1      = 0x02  /**< prefer larger L1 cache and smaller shared memory */
275 } CUfunc_cache;
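
/*
 * Example (illustrative sketch, not part of the original API): asking for a
 * larger L1 cache for a kernel that uses little shared memory. Assumes the
 * driver entry point ::cuFuncSetCacheConfig is available and \p kernel is a
 * loaded ::CUfunction.
 *
 * \code
 * cuFuncSetCacheConfig(kernel, CU_FUNC_CACHE_PREFER_L1);
 * \endcode
 */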
276 
277 /**
278  * Memory types
279  */
280 typedef enum CUmemorytype_enum
281 {
282     CU_MEMORYTYPE_HOST    = 0x01,    /**< Host memory */
283     CU_MEMORYTYPE_DEVICE  = 0x02,    /**< Device memory */
284     CU_MEMORYTYPE_ARRAY   = 0x03     /**< Array memory */
285 #if __CUDA_API_VERSION >= 4000
286   , CU_MEMORYTYPE_UNIFIED = 0x04     /**< Unified device or host memory */
287 #endif
288 } CUmemorytype;
289 
290 /**
291  * Compute Modes
292  */
293 typedef enum CUcomputemode_enum
294 {
    CU_COMPUTEMODE_DEFAULT           = 0, /**< Default compute mode (Multiple contexts allowed per device) */
    CU_COMPUTEMODE_EXCLUSIVE         = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */
    CU_COMPUTEMODE_PROHIBITED        = 2  /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
298 #if __CUDA_API_VERSION >= 4000
299   , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
300 #endif
301 } CUcomputemode;
302 
303 /**
304  * Online compiler options
305  */
306 typedef enum CUjit_option_enum
307 {
308     /**
309      * Max number of registers that a thread may use.\n
310      * Option type: unsigned int
311      */
312     CU_JIT_MAX_REGISTERS = 0,
313 
314     /**
315      * IN: Specifies minimum number of threads per block to target compilation
316      * for\n
317      * OUT: Returns the number of threads the compiler actually targeted.
     * This restricts the resource utilization of the compiler (e.g. max
319      * registers) such that a block with the given number of threads should be
320      * able to launch based on register limitations. Note, this option does not
321      * currently take into account any other resource limitations, such as
322      * shared memory utilization.\n
323      * Option type: unsigned int
324      */
325     CU_JIT_THREADS_PER_BLOCK,
326 
327     /**
328      * Returns a float value in the option of the wall clock time, in
329      * milliseconds, spent creating the cubin\n
330      * Option type: float
331      */
332     CU_JIT_WALL_TIME,
333 
334     /**
     * Pointer to a buffer in which to print any log messages from PTXAS
336      * that are informational in nature (the buffer size is specified via
337      * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
338      * Option type: char*
339      */
340     CU_JIT_INFO_LOG_BUFFER,
341 
342     /**
343      * IN: Log buffer size in bytes.  Log messages will be capped at this size
344      * (including null terminator)\n
345      * OUT: Amount of log buffer filled with messages\n
346      * Option type: unsigned int
347      */
348     CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
349 
350     /**
351      * Pointer to a buffer in which to print any log messages from PTXAS that
352      * reflect errors (the buffer size is specified via option
353      * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
354      * Option type: char*
355      */
356     CU_JIT_ERROR_LOG_BUFFER,
357 
358     /**
359      * IN: Log buffer size in bytes.  Log messages will be capped at this size
360      * (including null terminator)\n
361      * OUT: Amount of log buffer filled with messages\n
362      * Option type: unsigned int
363      */
364     CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
365 
366     /**
367      * Level of optimizations to apply to generated code (0 - 4), with 4
368      * being the default and highest level of optimizations.\n
369      * Option type: unsigned int
370      */
371     CU_JIT_OPTIMIZATION_LEVEL,
372 
373     /**
374      * No option value required. Determines the target based on the current
375      * attached context (default)\n
376      * Option type: No option value needed
377      */
378     CU_JIT_TARGET_FROM_CUCONTEXT,
379 
380     /**
381      * Target is chosen based on supplied ::CUjit_target_enum.\n
382      * Option type: unsigned int for enumerated type ::CUjit_target_enum
383      */
384     CU_JIT_TARGET,
385 
386     /**
387      * Specifies choice of fallback strategy if matching cubin is not found.
388      * Choice is based on supplied ::CUjit_fallback_enum.\n
389      * Option type: unsigned int for enumerated type ::CUjit_fallback_enum
390      */
391     CU_JIT_FALLBACK_STRATEGY
392 
393 } CUjit_option;
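
/*
 * Example (illustrative sketch, not part of the original API): JIT-compiling a
 * PTX image with a register cap and an info log. Assumes the driver entry
 * point ::cuModuleLoadDataEx (matching ::tcuModuleLoadDataEx declared below)
 * is available and \p ptxSource points to NUL-terminated PTX text.
 *
 * \code
 * char         infoLog[4096];
 * CUjit_option opts[] = { CU_JIT_INFO_LOG_BUFFER,
 *                         CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
 *                         CU_JIT_MAX_REGISTERS };
 * void        *vals[] = { infoLog,
 *                         (void *)(size_t)sizeof(infoLog),
 *                         (void *)(size_t)32 };        // cap at 32 registers
 * CUmodule     module;
 * CUresult     err = cuModuleLoadDataEx(&module, ptxSource, 3, opts, vals);
 * \endcode
 */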
394 
395 /**
396  * Online compilation targets
397  */
398 typedef enum CUjit_target_enum
399 {
400     CU_TARGET_COMPUTE_10 = 0,   /**< Compute device class 1.0 */
401     CU_TARGET_COMPUTE_11,       /**< Compute device class 1.1 */
402     CU_TARGET_COMPUTE_12,       /**< Compute device class 1.2 */
403     CU_TARGET_COMPUTE_13,       /**< Compute device class 1.3 */
404     CU_TARGET_COMPUTE_20,       /**< Compute device class 2.0 */
405     CU_TARGET_COMPUTE_21        /**< Compute device class 2.1 */
406 } CUjit_target;
407 
408 /**
409  * Cubin matching fallback strategies
410  */
411 typedef enum CUjit_fallback_enum
412 {
413     CU_PREFER_PTX = 0,  /**< Prefer to compile ptx */
414 
415     CU_PREFER_BINARY    /**< Prefer to fall back to compatible binary code */
416 
417 } CUjit_fallback;
418 
419 /**
420  * Flags to register a graphics resource
421  */
422 typedef enum CUgraphicsRegisterFlags_enum
423 {
424     CU_GRAPHICS_REGISTER_FLAGS_NONE          = 0x00,
425     CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY     = 0x01,
426     CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
427     CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST  = 0x04
428 } CUgraphicsRegisterFlags;
429 
430 /**
431  * Flags for mapping and unmapping interop resources
432  */
433 typedef enum CUgraphicsMapResourceFlags_enum
434 {
435     CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
436     CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
437     CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
438 } CUgraphicsMapResourceFlags;
439 
440 /**
441  * Array indices for cube faces
442  */
443 typedef enum CUarray_cubemap_face_enum
444 {
445     CU_CUBEMAP_FACE_POSITIVE_X  = 0x00, /**< Positive X face of cubemap */
446     CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01, /**< Negative X face of cubemap */
447     CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02, /**< Positive Y face of cubemap */
448     CU_CUBEMAP_FACE_NEGATIVE_Y  = 0x03, /**< Negative Y face of cubemap */
449     CU_CUBEMAP_FACE_POSITIVE_Z  = 0x04, /**< Positive Z face of cubemap */
450     CU_CUBEMAP_FACE_NEGATIVE_Z  = 0x05  /**< Negative Z face of cubemap */
451 } CUarray_cubemap_face;
452 
453 /**
454  * Limits
455  */
456 typedef enum CUlimit_enum
457 {
458     CU_LIMIT_STACK_SIZE        = 0x00, /**< GPU thread stack size */
459     CU_LIMIT_PRINTF_FIFO_SIZE  = 0x01, /**< GPU printf FIFO size */
460     CU_LIMIT_MALLOC_HEAP_SIZE  = 0x02  /**< GPU malloc heap size */
461 } CUlimit;
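
/*
 * Example (illustrative sketch, not part of the original API): enlarging the
 * device printf FIFO for a kernel that logs heavily. Assumes a context is
 * current and the driver entry point ::cuCtxSetLimit is available.
 *
 * \code
 * cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 8 * 1024 * 1024); // 8 MB FIFO
 * \endcode
 */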
462 
463 /**
464  * Error codes
465  */
466 typedef enum cudaError_enum
467 {
468     /**
469      * The API call returned with no errors. In the case of query calls, this
470      * can also mean that the operation being queried is complete (see
471      * ::cuEventQuery() and ::cuStreamQuery()).
472      */
473     CUDA_SUCCESS                              = 0,
474 
475     /**
476      * This indicates that one or more of the parameters passed to the API call
477      * is not within an acceptable range of values.
478      */
479     CUDA_ERROR_INVALID_VALUE                  = 1,
480 
481     /**
482      * The API call failed because it was unable to allocate enough memory to
483      * perform the requested operation.
484      */
485     CUDA_ERROR_OUT_OF_MEMORY                  = 2,
486 
487     /**
488      * This indicates that the CUDA driver has not been initialized with
489      * ::cuInit() or that initialization has failed.
490      */
491     CUDA_ERROR_NOT_INITIALIZED                = 3,
492 
493     /**
494      * This indicates that the CUDA driver is in the process of shutting down.
495      */
496     CUDA_ERROR_DEINITIALIZED                  = 4,
497 
498     /**
499      * This indicates profiling APIs are called while application is running
500      * in visual profiler mode.
501     */
502     CUDA_ERROR_PROFILER_DISABLED           = 5,
503     /**
504      * This indicates profiling has not been initialized for this context.
505      * Call cuProfilerInitialize() to resolve this.
506     */
507     CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,
508     /**
509      * This indicates profiler has already been started and probably
510      * cuProfilerStart() is incorrectly called.
511     */
512     CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,
513     /**
514      * This indicates profiler has already been stopped and probably
515      * cuProfilerStop() is incorrectly called.
516     */
517     CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,
518     /**
519      * This indicates that no CUDA-capable devices were detected by the installed
520      * CUDA driver.
521      */
522     CUDA_ERROR_NO_DEVICE                      = 100,
523 
524     /**
525      * This indicates that the device ordinal supplied by the user does not
526      * correspond to a valid CUDA device.
527      */
528     CUDA_ERROR_INVALID_DEVICE                 = 101,
529 
530 
531     /**
532      * This indicates that the device kernel image is invalid. This can also
533      * indicate an invalid CUDA module.
534      */
535     CUDA_ERROR_INVALID_IMAGE                  = 200,
536 
537     /**
538      * This most frequently indicates that there is no context bound to the
539      * current thread. This can also be returned if the context passed to an
540      * API call is not a valid handle (such as a context that has had
541      * ::cuCtxDestroy() invoked on it). This can also be returned if a user
542      * mixes different API versions (i.e. 3010 context with 3020 API calls).
543      * See ::cuCtxGetApiVersion() for more details.
544      */
545     CUDA_ERROR_INVALID_CONTEXT                = 201,
546 
547     /**
548      * This indicated that the context being supplied as a parameter to the
549      * API call was already the active context.
550      * \deprecated
551      * This error return is deprecated as of CUDA 3.2. It is no longer an
552      * error to attempt to push the active context via ::cuCtxPushCurrent().
553      */
554     CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,
555 
556     /**
557      * This indicates that a map or register operation has failed.
558      */
559     CUDA_ERROR_MAP_FAILED                     = 205,
560 
561     /**
562      * This indicates that an unmap or unregister operation has failed.
563      */
564     CUDA_ERROR_UNMAP_FAILED                   = 206,
565 
566     /**
567      * This indicates that the specified array is currently mapped and thus
568      * cannot be destroyed.
569      */
570     CUDA_ERROR_ARRAY_IS_MAPPED                = 207,
571 
572     /**
573      * This indicates that the resource is already mapped.
574      */
575     CUDA_ERROR_ALREADY_MAPPED                 = 208,
576 
577     /**
578      * This indicates that there is no kernel image available that is suitable
579      * for the device. This can occur when a user specifies code generation
580      * options for a particular CUDA source file that do not include the
581      * corresponding device configuration.
582      */
583     CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,
584 
585     /**
586      * This indicates that a resource has already been acquired.
587      */
588     CUDA_ERROR_ALREADY_ACQUIRED               = 210,
589 
590     /**
591      * This indicates that a resource is not mapped.
592      */
593     CUDA_ERROR_NOT_MAPPED                     = 211,
594 
595     /**
596      * This indicates that a mapped resource is not available for access as an
597      * array.
598      */
599     CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,
600 
601     /**
602      * This indicates that a mapped resource is not available for access as a
603      * pointer.
604      */
605     CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,
606 
607     /**
608      * This indicates that an uncorrectable ECC error was detected during
609      * execution.
610      */
611     CUDA_ERROR_ECC_UNCORRECTABLE              = 214,
612 
613     /**
614      * This indicates that the ::CUlimit passed to the API call is not
615      * supported by the active device.
616      */
617     CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,
618 
619     /**
620      * This indicates that the ::CUcontext passed to the API call can
621      * only be bound to a single CPU thread at a time but is already
622      * bound to a CPU thread.
623      */
624     CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,
625 
626     /**
627      * This indicates that the device kernel source is invalid.
628      */
629     CUDA_ERROR_INVALID_SOURCE                 = 300,
630 
631     /**
632      * This indicates that the file specified was not found.
633      */
634     CUDA_ERROR_FILE_NOT_FOUND                 = 301,
635 
636     /**
637      * This indicates that a link to a shared object failed to resolve.
638      */
639     CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
640 
641     /**
642      * This indicates that initialization of a shared object failed.
643      */
644     CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,
645 
646     /**
647      * This indicates that an OS call failed.
648      */
649     CUDA_ERROR_OPERATING_SYSTEM               = 304,
650 
651 
652     /**
653      * This indicates that a resource handle passed to the API call was not
654      * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
655      */
656     CUDA_ERROR_INVALID_HANDLE                 = 400,
657 
658 
659     /**
660      * This indicates that a named symbol was not found. Examples of symbols
661      * are global/constant variable names, texture names, and surface names.
662      */
663     CUDA_ERROR_NOT_FOUND                      = 500,
664 
665 
666     /**
667      * This indicates that asynchronous operations issued previously have not
668      * completed yet. This result is not actually an error, but must be indicated
669      * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
670      * may return this value include ::cuEventQuery() and ::cuStreamQuery().
671      */
672     CUDA_ERROR_NOT_READY                      = 600,
673 
674 
675     /**
676      * An exception occurred on the device while executing a kernel. Common
677      * causes include dereferencing an invalid device pointer and accessing
678      * out of bounds shared memory. The context cannot be used, so it must
679      * be destroyed (and a new one should be created). All existing device
680      * memory allocations from this context are invalid and must be
681      * reconstructed if the program is to continue using CUDA.
682      */
683     CUDA_ERROR_LAUNCH_FAILED                  = 700,
684 
685     /**
686      * This indicates that a launch did not occur because it did not have
687      * appropriate resources. This error usually indicates that the user has
688      * attempted to pass too many arguments to the device kernel, or the
689      * kernel launch specifies too many threads for the kernel's register
690      * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
691      * when a 32-bit int is expected) is equivalent to passing too many
692      * arguments and can also result in this error.
693      */
694     CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,
695 
696     /**
697      * This indicates that the device kernel took too long to execute. This can
698      * only occur if timeouts are enabled - see the device attribute
699      * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
700      * context cannot be used (and must be destroyed similar to
701      * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
702      * this context are invalid and must be reconstructed if the program is to
703      * continue using CUDA.
704      */
705     CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,
706 
707     /**
708      * This error indicates a kernel launch that uses an incompatible texturing
709      * mode.
710      */
711     CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
712 
713     /**
714      * This error indicates that a call to ::cuCtxEnablePeerAccess() is
715      * trying to re-enable peer access to a context which has already
716      * had peer access to it enabled.
717      */
718     CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,
719 
720     /**
721      * This error indicates that a call to ::cuMemPeerRegister is trying to
722      * register memory from a context which has not had peer access
723      * enabled yet via ::cuCtxEnablePeerAccess(), or that
724      * ::cuCtxDisablePeerAccess() is trying to disable peer access
725      * which has not been enabled yet.
726      */
727     CUDA_ERROR_PEER_ACCESS_NOT_ENABLED    = 705,
728 
729     /**
730      * This error indicates that a call to ::cuMemPeerRegister is trying to
731      * register already-registered memory.
732      */
733     CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706,
734 
735     /**
736      * This error indicates that a call to ::cuMemPeerUnregister is trying to
737      * unregister memory that has not been registered.
738      */
739     CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED     = 707,
740 
741     /**
742      * This error indicates that ::cuCtxCreate was called with the flag
743      * ::CU_CTX_PRIMARY on a device which already has initialized its
744      * primary context.
745      */
746     CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,
747 
748     /**
749      * This error indicates that the context current to the calling thread
750      * has been destroyed using ::cuCtxDestroy, or is a primary context which
751      * has not yet been initialized.
752      */
753     CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,
754 
755     /**
756      * This indicates that an unknown internal error has occurred.
757      */
758     CUDA_ERROR_UNKNOWN                        = 999
759 } CUresult;
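
/*
 * Example (illustrative sketch, not part of the original API): a minimal
 * error-checking wrapper for driver API calls. The macro name CHECK_CU is
 * hypothetical; this API version has no error-to-string helper, so only the
 * numeric ::CUresult value is reported (fprintf requires <stdio.h>).
 *
 * \code
 * #define CHECK_CU(call)                                          \
 *     do {                                                        \
 *         CUresult err_ = (call);                                 \
 *         if (err_ != CUDA_SUCCESS) {                             \
 *             fprintf(stderr, "%s failed with CUresult %d\n",     \
 *                     #call, (int)err_);                          \
 *         }                                                       \
 *     } while (0)
 *
 * CHECK_CU(cuCtxSynchronize());
 * \endcode
 */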
760 
761 #if __CUDA_API_VERSION >= 4000
762 /**
763  * If set, host memory is portable between CUDA contexts.
764  * Flag for ::cuMemHostAlloc()
765  */
766 #define CU_MEMHOSTALLOC_PORTABLE        0x01
767 
768 /**
769  * If set, host memory is mapped into CUDA address space and
770  * ::cuMemHostGetDevicePointer() may be called on the host pointer.
771  * Flag for ::cuMemHostAlloc()
772  */
773 #define CU_MEMHOSTALLOC_DEVICEMAP       0x02
774 
775 /**
776  * If set, host memory is allocated as write-combined - fast to write,
777  * faster to DMA, slow to read except via SSE4 streaming load instruction
778  * (MOVNTDQA).
779  * Flag for ::cuMemHostAlloc()
780  */
781 #define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
782 
783 /**
784  * If set, host memory is portable between CUDA contexts.
785  * Flag for ::cuMemHostRegister()
786  */
787 #define CU_MEMHOSTREGISTER_PORTABLE     0x01
788 
789 /**
790  * If set, host memory is mapped into CUDA address space and
791  * ::cuMemHostGetDevicePointer() may be called on the host pointer.
792  * Flag for ::cuMemHostRegister()
793  */
794 #define CU_MEMHOSTREGISTER_DEVICEMAP    0x02
795 
796 /**
797  * If set, peer memory is mapped into CUDA address space and
798  * ::cuMemPeerGetDevicePointer() may be called on the host pointer.
799  * Flag for ::cuMemPeerRegister()
800  */
801 #define CU_MEMPEERREGISTER_DEVICEMAP    0x02
802 #endif
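
/*
 * Example (illustrative sketch, not part of the original API): allocating
 * mapped, portable pinned host memory and obtaining the matching device
 * pointer. Assumes the context was created with ::CU_CTX_MAP_HOST and the
 * driver entry points ::cuMemHostAlloc, ::cuMemHostGetDevicePointer and
 * ::cuMemFreeHost are available.
 *
 * \code
 * void       *hostPtr = NULL;
 * CUdeviceptr devPtr  = 0;
 * cuMemHostAlloc(&hostPtr, 1 << 20,
 *                CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP);
 * cuMemHostGetDevicePointer(&devPtr, hostPtr, 0);
 * // ... kernels may access devPtr while the host uses hostPtr ...
 * cuMemFreeHost(hostPtr);
 * \endcode
 */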
803 
804 #if __CUDA_API_VERSION >= 3020
805 /**
806  * 2D memory copy parameters
807  */
808 typedef struct CUDA_MEMCPY2D_st
809 {
810     size_t srcXInBytes;         /**< Source X in bytes */
811     size_t srcY;                /**< Source Y */
812 
813     CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
814     const void *srcHost;        /**< Source host pointer */
815     CUdeviceptr srcDevice;      /**< Source device pointer */
816     CUarray srcArray;           /**< Source array reference */
817     size_t srcPitch;            /**< Source pitch (ignored when src is array) */
818 
819     size_t dstXInBytes;         /**< Destination X in bytes */
820     size_t dstY;                /**< Destination Y */
821 
822     CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
823     void *dstHost;              /**< Destination host pointer */
824     CUdeviceptr dstDevice;      /**< Destination device pointer */
825     CUarray dstArray;           /**< Destination array reference */
826     size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
827 
828     size_t WidthInBytes;        /**< Width of 2D memory copy in bytes */
829     size_t Height;              /**< Height of 2D memory copy */
830 } CUDA_MEMCPY2D;
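
/*
 * Example (illustrative sketch, not part of the original API): copying a
 * pitched host image into a pitched device allocation. Assumes the driver
 * entry point ::cuMemcpy2D is available and that \p hostImg, \p hostPitch,
 * \p devImg, \p devPitch, \p widthBytes and \p height describe existing
 * allocations (memset requires <string.h>).
 *
 * \code
 * CUDA_MEMCPY2D cpy;
 * memset(&cpy, 0, sizeof(cpy));            // zero unused fields
 * cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
 * cpy.srcHost       = hostImg;
 * cpy.srcPitch      = hostPitch;
 * cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
 * cpy.dstDevice     = devImg;
 * cpy.dstPitch      = devPitch;
 * cpy.WidthInBytes  = widthBytes;
 * cpy.Height        = height;
 * cuMemcpy2D(&cpy);
 * \endcode
 */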
831 
832 /**
833  * 3D memory copy parameters
834  */
835 typedef struct CUDA_MEMCPY3D_st
836 {
837     size_t srcXInBytes;         /**< Source X in bytes */
838     size_t srcY;                /**< Source Y */
839     size_t srcZ;                /**< Source Z */
840     size_t srcLOD;              /**< Source LOD */
841     CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
842     const void *srcHost;        /**< Source host pointer */
843     CUdeviceptr srcDevice;      /**< Source device pointer */
844     CUarray srcArray;           /**< Source array reference */
845     void *reserved0;            /**< Must be NULL */
846     size_t srcPitch;            /**< Source pitch (ignored when src is array) */
847     size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
848 
849     size_t dstXInBytes;         /**< Destination X in bytes */
850     size_t dstY;                /**< Destination Y */
851     size_t dstZ;                /**< Destination Z */
852     size_t dstLOD;              /**< Destination LOD */
853     CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
854     void *dstHost;              /**< Destination host pointer */
855     CUdeviceptr dstDevice;      /**< Destination device pointer */
856     CUarray dstArray;           /**< Destination array reference */
857     void *reserved1;            /**< Must be NULL */
858     size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
859     size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
860 
861     size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
862     size_t Height;              /**< Height of 3D memory copy */
863     size_t Depth;               /**< Depth of 3D memory copy */
864 } CUDA_MEMCPY3D;
865 
866 /**
867  * 3D memory cross-context copy parameters
868  */
869 typedef struct CUDA_MEMCPY3D_PEER_st
870 {
871     size_t srcXInBytes;         /**< Source X in bytes */
872     size_t srcY;                /**< Source Y */
873     size_t srcZ;                /**< Source Z */
874     size_t srcLOD;              /**< Source LOD */
875     CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
876     const void *srcHost;        /**< Source host pointer */
877     CUdeviceptr srcDevice;      /**< Source device pointer */
878     CUarray srcArray;           /**< Source array reference */
    CUcontext srcContext;       /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
880     size_t srcPitch;            /**< Source pitch (ignored when src is array) */
881     size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
882 
883     size_t dstXInBytes;         /**< Destination X in bytes */
884     size_t dstY;                /**< Destination Y */
885     size_t dstZ;                /**< Destination Z */
886     size_t dstLOD;              /**< Destination LOD */
887     CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
888     void *dstHost;              /**< Destination host pointer */
889     CUdeviceptr dstDevice;      /**< Destination device pointer */
890     CUarray dstArray;           /**< Destination array reference */
    CUcontext dstContext;       /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
892     size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
893     size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
894 
895     size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
896     size_t Height;              /**< Height of 3D memory copy */
897     size_t Depth;               /**< Depth of 3D memory copy */
898 } CUDA_MEMCPY3D_PEER;
899 
900 /**
901  * Array descriptor
902  */
903 typedef struct CUDA_ARRAY_DESCRIPTOR_st
904 {
905     size_t Width;             /**< Width of array */
906     size_t Height;            /**< Height of array */
907 
908     CUarray_format Format;    /**< Array format */
909     unsigned int NumChannels; /**< Channels per array element */
910 } CUDA_ARRAY_DESCRIPTOR;
911 
912 /**
913  * 3D array descriptor
914  */
915 typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
916 {
917     size_t Width;             /**< Width of 3D array */
918     size_t Height;            /**< Height of 3D array */
919     size_t Depth;             /**< Depth of 3D array */
920 
921     CUarray_format Format;    /**< Array format */
922     unsigned int NumChannels; /**< Channels per array element */
923     unsigned int Flags;       /**< Flags */
924 } CUDA_ARRAY3D_DESCRIPTOR;
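
/*
 * Example (illustrative sketch, not part of the original API): describing and
 * creating a layered 2D array (see ::CUDA_ARRAY3D_LAYERED below). Assumes a
 * context is current and the driver entry point ::cuArray3DCreate is
 * available (memset requires <string.h>).
 *
 * \code
 * CUDA_ARRAY3D_DESCRIPTOR desc;
 * CUarray array;
 * memset(&desc, 0, sizeof(desc));
 * desc.Width       = 1024;
 * desc.Height      = 768;
 * desc.Depth       = 8;                    // number of layers, not 3D depth
 * desc.Format      = CU_AD_FORMAT_FLOAT;
 * desc.NumChannels = 1;
 * desc.Flags       = CUDA_ARRAY3D_LAYERED;
 * cuArray3DCreate(&array, &desc);
 * \endcode
 */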
925 
926 #endif /* __CUDA_API_VERSION >= 3020 */
927 
928 /**
929  * If set, the CUDA array is a collection of layers, where each layer is either a 1D
930  * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
931  * of layers, not the depth of a 3D array.
932  */
933 #define CUDA_ARRAY3D_LAYERED        0x01
934 
935 /**
936  * Deprecated, use CUDA_ARRAY3D_LAYERED
937  */
938 #define CUDA_ARRAY3D_2DARRAY        0x01
939 
940 /**
941  * This flag must be set in order to bind a surface reference
942  * to the CUDA array
943  */
944 #define CUDA_ARRAY3D_SURFACE_LDST   0x02
945 
946 /**
947  * Override the texref format with a format inferred from the array.
948  * Flag for ::cuTexRefSetArray()
949  */
950 #define CU_TRSA_OVERRIDE_FORMAT 0x01
951 
952 /**
953  * Read the texture as integers rather than promoting the values to floats
954  * in the range [0,1].
955  * Flag for ::cuTexRefSetFlags()
956  */
957 #define CU_TRSF_READ_AS_INTEGER         0x01
958 
959 /**
960  * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
961  * Flag for ::cuTexRefSetFlags()
962  */
963 #define CU_TRSF_NORMALIZED_COORDINATES  0x02
964 
965 /**
966  * Perform sRGB->linear conversion during texture read.
967  * Flag for ::cuTexRefSetFlags()
968  */
969 #define CU_TRSF_SRGB  0x10
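
/*
 * Example (illustrative sketch, not part of the original API): configuring a
 * texture reference to return raw integer texels. Assumes \p texRef was
 * obtained with ::cuModuleGetTexRef and the driver entry point
 * ::cuTexRefSetFlags is available.
 *
 * \code
 * cuTexRefSetFlags(texRef, CU_TRSF_READ_AS_INTEGER);
 * \endcode
 */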
970 
971 /**
972  * End of array terminator for the \p extra parameter to
973  * ::cuLaunchKernel
974  */
975 #define CU_LAUNCH_PARAM_END            ((void*)0x00)
976 
977 /**
978  * Indicator that the next value in the \p extra parameter to
979  * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
980  * parameters used for launching kernel \p f.  This buffer needs to
981  * honor all alignment/padding requirements of the individual parameters.
982  * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
983  * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
984  * effect.
985  */
986 #define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
987 
988 /**
989  * Indicator that the next value in the \p extra parameter to
990  * ::cuLaunchKernel will be a pointer to a size_t which contains the
991  * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
992  * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
993  * in the \p extra array if the value associated with
994  * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
995  */
996 #define CU_LAUNCH_PARAM_BUFFER_SIZE    ((void*)0x02)
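
/*
 * Example (illustrative sketch, not part of the original API): launching a
 * kernel through the \p extra parameter of ::cuLaunchKernel instead of
 * \p kernelParams. Assumes the driver entry point ::cuLaunchKernel is
 * available and that \p args is a packed argument struct that already honors
 * the kernel's alignment and padding requirements.
 *
 * \code
 * size_t argSize = sizeof(args);
 * void  *extra[] = {
 *     CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
 *     CU_LAUNCH_PARAM_BUFFER_SIZE,    &argSize,
 *     CU_LAUNCH_PARAM_END
 * };
 * cuLaunchKernel(kernel, gridX, 1, 1, blockX, 1, 1, 0, NULL, NULL, extra);
 * \endcode
 */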
997 
998 /**
999  * For texture references loaded into the module, use default texunit from
1000  * texture reference.
1001  */
1002 #define CU_PARAM_TR_DEFAULT -1
1003 
1004 /**
 * CUDA API made obsolete at API version 3020
1006  */
1007 #if defined(__CUDA_API_VERSION_INTERNAL)
1008     #define CUdeviceptr                  CUdeviceptr_v1
1009     #define CUDA_MEMCPY2D_st             CUDA_MEMCPY2D_v1_st
1010     #define CUDA_MEMCPY2D                CUDA_MEMCPY2D_v1
1011     #define CUDA_MEMCPY3D_st             CUDA_MEMCPY3D_v1_st
1012     #define CUDA_MEMCPY3D                CUDA_MEMCPY3D_v1
1013     #define CUDA_ARRAY_DESCRIPTOR_st     CUDA_ARRAY_DESCRIPTOR_v1_st
1014     #define CUDA_ARRAY_DESCRIPTOR        CUDA_ARRAY_DESCRIPTOR_v1
1015     #define CUDA_ARRAY3D_DESCRIPTOR_st   CUDA_ARRAY3D_DESCRIPTOR_v1_st
1016     #define CUDA_ARRAY3D_DESCRIPTOR      CUDA_ARRAY3D_DESCRIPTOR_v1
#endif /* __CUDA_API_VERSION_INTERNAL */
1018 
1019 #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
1020 typedef unsigned int CUdeviceptr;
1021 
1022 typedef struct CUDA_MEMCPY2D_st
1023 {
1024     unsigned int srcXInBytes;   /**< Source X in bytes */
1025     unsigned int srcY;          /**< Source Y */
1026     CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
1027     const void *srcHost;        /**< Source host pointer */
1028     CUdeviceptr srcDevice;      /**< Source device pointer */
1029     CUarray srcArray;           /**< Source array reference */
1030     unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
1031 
1032     unsigned int dstXInBytes;   /**< Destination X in bytes */
1033     unsigned int dstY;          /**< Destination Y */
1034     CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
1035     void *dstHost;              /**< Destination host pointer */
1036     CUdeviceptr dstDevice;      /**< Destination device pointer */
1037     CUarray dstArray;           /**< Destination array reference */
1038     unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
1039 
1040     unsigned int WidthInBytes;  /**< Width of 2D memory copy in bytes */
1041     unsigned int Height;        /**< Height of 2D memory copy */
1042 } CUDA_MEMCPY2D;
1043 
1044 typedef struct CUDA_MEMCPY3D_st
1045 {
1046     unsigned int srcXInBytes;   /**< Source X in bytes */
1047     unsigned int srcY;          /**< Source Y */
1048     unsigned int srcZ;          /**< Source Z */
1049     unsigned int srcLOD;        /**< Source LOD */
1050     CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
1051     const void *srcHost;        /**< Source host pointer */
1052     CUdeviceptr srcDevice;      /**< Source device pointer */
1053     CUarray srcArray;           /**< Source array reference */
1054     void *reserved0;            /**< Must be NULL */
1055     unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
1056     unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */
1057 
1058     unsigned int dstXInBytes;   /**< Destination X in bytes */
1059     unsigned int dstY;          /**< Destination Y */
1060     unsigned int dstZ;          /**< Destination Z */
1061     unsigned int dstLOD;        /**< Destination LOD */
1062     CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
1063     void *dstHost;              /**< Destination host pointer */
1064     CUdeviceptr dstDevice;      /**< Destination device pointer */
1065     CUarray dstArray;           /**< Destination array reference */
1066     void *reserved1;            /**< Must be NULL */
1067     unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
1068     unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
1069 
1070     unsigned int WidthInBytes;  /**< Width of 3D memory copy in bytes */
1071     unsigned int Height;        /**< Height of 3D memory copy */
1072     unsigned int Depth;         /**< Depth of 3D memory copy */
1073 } CUDA_MEMCPY3D;
1074 
1075 typedef struct CUDA_ARRAY_DESCRIPTOR_st
1076 {
1077     unsigned int Width;         /**< Width of array */
1078     unsigned int Height;        /**< Height of array */
1079 
1080     CUarray_format Format;      /**< Array format */
1081     unsigned int NumChannels;   /**< Channels per array element */
1082 } CUDA_ARRAY_DESCRIPTOR;
1083 
1084 typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
1085 {
1086     unsigned int Width;         /**< Width of 3D array */
1087     unsigned int Height;        /**< Height of 3D array */
1088     unsigned int Depth;         /**< Depth of 3D array */
1089 
1090     CUarray_format Format;      /**< Array format */
1091     unsigned int NumChannels;   /**< Channels per array element */
1092     unsigned int Flags;         /**< Flags */
1093 } CUDA_ARRAY3D_DESCRIPTOR;
1094 
1095 #endif /* (__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 */
1096 
1097 /*
1098  * If set, the CUDA array contains an array of 2D slices
1099  * and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
1100  * the number of slices, not the depth of a 3D array.
1101  */
1102 #define CUDA_ARRAY3D_2DARRAY        0x01
1103 
1104 /**
1105  * This flag must be set in order to bind a surface reference
1106  * to the CUDA array
1107  */
1108 #define CUDA_ARRAY3D_SURFACE_LDST   0x02
1109 
1110 /**
1111  * Override the texref format with a format inferred from the array.
1112  * Flag for ::cuTexRefSetArray()
1113  */
1114 #define CU_TRSA_OVERRIDE_FORMAT 0x01
1115 
1116 /**
1117  * Read the texture as integers rather than promoting the values to floats
1118  * in the range [0,1].
1119  * Flag for ::cuTexRefSetFlags()
1120  */
1121 #define CU_TRSF_READ_AS_INTEGER         0x01
1122 
1123 /**
1124  * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
1125  * Flag for ::cuTexRefSetFlags()
1126  */
1127 #define CU_TRSF_NORMALIZED_COORDINATES  0x02
1128 
1129 /**
1130  * Perform sRGB->linear conversion during texture read.
1131  * Flag for ::cuTexRefSetFlags()
1132  */
1133 #define CU_TRSF_SRGB  0x10
1134 
1135 /**
1136  * For texture references loaded into the module, use default texunit from
1137  * texture reference.
1138  */
1139 #define CU_PARAM_TR_DEFAULT -1
1140 
1141 /** @} */ /* END CUDA_TYPES */
1142 
1143 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
1144     #define CUDAAPI __stdcall
1145 #else
1146     #define CUDAAPI
1147 #endif
1148 
1149 /**
1150  * \defgroup CUDA_INITIALIZE Initialization
1151  *
1152  * This section describes the initialization functions of the low-level CUDA
1153  * driver application programming interface.
1154  *
1155  * @{
1156  */
1157 
1158 /*********************************
1159  ** Initialization
1160  *********************************/
1161 typedef CUresult  CUDAAPI tcuInit(unsigned int Flags);
1162 
1163 /*********************************
1164  ** Driver Version Query
1165  *********************************/
1166 typedef CUresult  CUDAAPI tcuDriverGetVersion(int *driverVersion);
1167 
1168 /************************************
1169  **
1170  **    Device management
1171  **
1172  ***********************************/
1173 
1174 typedef CUresult  CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
1175 typedef CUresult  CUDAAPI tcuDeviceGetCount(int *count);
1176 typedef CUresult  CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
1177 typedef CUresult  CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
1178 #if __CUDA_API_VERSION >= 3020
1179     typedef CUresult  CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
1180 #else
1181     typedef CUresult  CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
1182 #endif
1183 
1184 typedef CUresult  CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
1185 typedef CUresult  CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
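
/*
 * Example (illustrative sketch, not part of the original API): this header only
 * declares function-pointer types (::tcuInit, ::tcuDeviceGetCount, ...); the
 * actual entry points must be resolved from the driver library at run time.
 * A minimal POSIX sketch, assuming libcuda.so is installed and <dlfcn.h> is
 * available:
 *
 * \code
 * tcuInit           *cuInit;
 * tcuDeviceGetCount *cuDeviceGetCount;
 *
 * int loadDriverApi(void)
 * {
 *     void *lib = dlopen("libcuda.so", RTLD_NOW);
 *     if (!lib)
 *         return -1;
 *     cuInit           = (tcuInit *)           dlsym(lib, "cuInit");
 *     cuDeviceGetCount = (tcuDeviceGetCount *) dlsym(lib, "cuDeviceGetCount");
 *     return (cuInit && cuDeviceGetCount) ? 0 : -1;
 * }
 * \endcode
 */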
1186 
1187 /************************************
1188  **
1189  **    Context management
1190  **
1191  ***********************************/
1192 typedef CUresult  CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
1193 typedef CUresult  CUDAAPI tcuCtxDestroy(CUcontext ctx);
1194 typedef CUresult  CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
1195 typedef CUresult  CUDAAPI tcuCtxDetach(CUcontext ctx);
1196 typedef CUresult  CUDAAPI tcuCtxPushCurrent(CUcontext ctx);
1197 typedef CUresult  CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);
1198 
1199 typedef CUresult  CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
1200 typedef CUresult  CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);
1201 
1202 typedef CUresult  CUDAAPI tcuCtxGetDevice(CUdevice *device);
1203 typedef CUresult  CUDAAPI tcuCtxSynchronize(void);
1204 
1205 
1206 /************************************
1207  **
1208  **    Module management
1209  **
1210  ***********************************/
1211 typedef CUresult  CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
1212 typedef CUresult  CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
1213 typedef CUresult  CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
1214 typedef CUresult  CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
1215 typedef CUresult  CUDAAPI tcuModuleUnload(CUmodule hmod);
1216 typedef CUresult  CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
1217 
1218 #if __CUDA_API_VERSION >= 3020
1219     typedef CUresult  CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
1220 #else
1221     typedef CUresult  CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
1222 #endif
1223 
1224 typedef CUresult  CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
1225 typedef CUresult  CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
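
/*
 * Example (illustrative sketch, not part of the original API): loading a
 * module from a file and looking up a kernel by name. Assumes a context is
 * current and the driver entry points ::cuModuleLoad and ::cuModuleGetFunction
 * are available; the file name and kernel name are placeholders.
 *
 * \code
 * CUmodule   module;
 * CUfunction kernel;
 * cuModuleLoad(&module, "kernels.cubin");
 * cuModuleGetFunction(&kernel, module, "vectorAdd");
 * \endcode
 */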
1226 
1227 /************************************
1228  **
1229  **    Memory management
1230  **
1231  ***********************************/
1232 #if __CUDA_API_VERSION >= 3020
1233     typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
1234     typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
1235     typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
1236     typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
1237                                               size_t *pPitch,
1238                                               size_t WidthInBytes,
1239                                               size_t Height,
1240                                               // size of biggest r/w to be performed by kernels on this memory
1241                                               // 4, 8 or 16 bytes
1242                                               unsigned int ElementSizeBytes
1243                                              );
1244 #else
1245     typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
1246     typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
1247     typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
1248     typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
1249                                               unsigned int *pPitch,
1250                                               unsigned int WidthInBytes,
1251                                               unsigned int Height,
1252                                               // size of biggest r/w to be performed by kernels on this memory
1253                                               // 4, 8 or 16 bytes
1254                                               unsigned int ElementSizeBytes
1255                                              );
1256 #endif
1257 
1258 typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);
1259 
1260 #if __CUDA_API_VERSION >= 3020
1261     typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
1262 #else
1263     typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
1264 #endif
1265 
1266 typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
1267 typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
1268 
1269 typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
1270 typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);
1271 
1272 typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);
1274 typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
1275 typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
1276 
1277 /************************************
1278  **
1279  **    Synchronous Memcpy
1280  **
 ** Intra-device memcpys done with these functions may execute in parallel with the CPU,
 ** but if host memory is involved, they wait until the copy is done before returning.
1283  **
1284  ***********************************/
1285 // 1D functions
1286 #if __CUDA_API_VERSION >= 3020
1287     // system <-> device memory
1288     typedef CUresult  CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
1289     typedef CUresult  CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
1290 
1291     // device <-> device memory
1292     typedef CUresult  CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
1293 
1294     // device <-> array memory
1295     typedef CUresult  CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
1296     typedef CUresult  CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
1297 
1298     // system <-> array memory
1299     typedef CUresult  CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
1300     typedef CUresult  CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
1301 
1302     // array <-> array memory
1303     typedef CUresult  CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
1304 #else
1305     // system <-> device memory
1306     typedef CUresult  CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
1307     typedef CUresult  CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);
1308 
1309     // device <-> device memory
1310     typedef CUresult  CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);
1311 
1312     // device <-> array memory
1313     typedef CUresult  CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
1314     typedef CUresult  CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
1315 
1316     // system <-> array memory
1317     typedef CUresult  CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
1318     typedef CUresult  CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
1319 
1320     // array <-> array memory
1321     typedef CUresult  CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
1322 #endif
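/*
 * Usage sketch (illustrative; the host buffer and sizes are placeholders):
 * a blocking host-to-device copy followed by the reverse copy, using the 1D
 * entry points above. Error checking omitted.
 *
 *     float h_data[256];
 *     CUdeviceptr d_data = 0;
 *     cuMemAlloc(&d_data, sizeof(h_data));
 *     cuMemcpyHtoD(d_data, h_data, sizeof(h_data));  // host -> device
 *     // ... launch work that consumes d_data ...
 *     cuMemcpyDtoH(h_data, d_data, sizeof(h_data));  // device -> host
 *     cuMemFree(d_data);
 */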
1323 
1324 // 2D memcpy
1325 typedef CUresult  CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
1326 typedef CUresult  CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
1327 
1328 // 3D memcpy
1329 typedef CUresult  CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
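/*
 * Usage sketch for the descriptor-based 2D copy (illustrative; assumes a
 * pitched device allocation d_img with byte pitch dpitch and a tightly packed
 * host image h_img of width*height bytes; field names follow the driver API's
 * CUDA_MEMCPY2D layout, and memset from <string.h> clears the unused fields).
 *
 *     CUDA_MEMCPY2D cpy;
 *     memset(&cpy, 0, sizeof(cpy));
 *     cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
 *     cpy.srcHost       = h_img;
 *     cpy.srcPitch      = width;            // bytes per source row
 *     cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
 *     cpy.dstDevice     = d_img;
 *     cpy.dstPitch      = dpitch;           // bytes per destination row
 *     cpy.WidthInBytes  = width;
 *     cpy.Height        = height;
 *     cuMemcpy2D(&cpy);
 */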
1330 
1331 /************************************
1332  **
1333  **    Asynchronous Memcpy
1334  **
 ** Any host memory involved must be DMA-able (e.g., allocated with cuMemAllocHost).
 ** Copies performed with these functions execute in parallel with the CPU and, if
 ** the hardware supports it, may also execute in parallel with the GPU.
 ** Asynchronous copies must be paired with appropriate stream synchronization.
1339  **
1340  ***********************************/
1341 
1342 // 1D functions
1343 #if __CUDA_API_VERSION >= 3020
1344     // system <-> device memory
1345     typedef CUresult  CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
1346                                                  const void *srcHost, size_t ByteCount, CUstream hStream);
1347     typedef CUresult  CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
1348                                                  CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
1349 
1350     // device <-> device memory
1351     typedef CUresult  CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
1352                                                  CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
1353 
1354     // system <-> array memory
1355     typedef CUresult  CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
1356                                                  const void *srcHost, size_t ByteCount, CUstream hStream);
1357     typedef CUresult  CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
1358                                                  size_t ByteCount, CUstream hStream);
1359 #else
1360     // system <-> device memory
1361     typedef CUresult  CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
1362                                                  const void *srcHost, unsigned int ByteCount, CUstream hStream);
1363     typedef CUresult  CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
1364                                                  CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
1365 
1366     // device <-> device memory
1367     typedef CUresult  CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
1368                                                  CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
1369 
1370     // system <-> array memory
1371     typedef CUresult  CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
1372                                                  const void *srcHost, unsigned int ByteCount, CUstream hStream);
1373     typedef CUresult  CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
1374                                                  unsigned int ByteCount, CUstream hStream);
1375 #endif
1376 
1377 // 2D memcpy
1378 typedef CUresult  CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
1379 
1380 // 3D memcpy
1381 typedef CUresult  CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
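/*
 * Usage sketch (illustrative; d_data, h_pinned, and nbytes are placeholders):
 * an asynchronous copy issued into a stream. The host buffer must be
 * page-locked (e.g., from cuMemHostAlloc) for the copy to be truly
 * asynchronous, and the stream must be synchronized before the host consumes
 * the result.
 *
 *     CUstream stream = 0;
 *     cuStreamCreate(&stream, 0);
 *     cuMemcpyHtoDAsync(d_data, h_pinned, nbytes, stream);  // queue the copy
 *     // ... queue kernels into the same stream ...
 *     cuStreamSynchronize(stream);                          // wait for completion
 *     cuStreamDestroy(stream);
 */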
1382 
1383 /************************************
1384  **
1385  **    Memset
1386  **
1387  ***********************************/
1388 #if __CUDA_API_VERSION >= 3020
1389     typedef CUresult  CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
1390     typedef CUresult  CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
1391     typedef CUresult  CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
1392     typedef CUresult  CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
1393     typedef CUresult  CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
1394     typedef CUresult  CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
1395 #else
1396     typedef CUresult  CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
1397     typedef CUresult  CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
1398     typedef CUresult  CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);
1399     typedef CUresult  CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
1400     typedef CUresult  CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
1401     typedef CUresult  CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
1402 #endif
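/*
 * Usage sketch (illustrative; d_data and N are placeholders): clearing a
 * device buffer of N 32-bit words. Error checking omitted.
 *
 *     cuMemsetD32(d_data, 0, N);         // write N zero words
 *     cuMemsetD8(d_data, 0xFF, N * 4);   // or fill the same region byte-wise
 */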
1403 
1404 /************************************
1405  **
1406  **    Function management
1407  **
1408  ***********************************/
1409 
1410 
1411 typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
1412 typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
1413 typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
1414 typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
1415 typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
1416                                          unsigned int gridDimX,  unsigned int gridDimY,  unsigned int gridDimZ,
1417                                          unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
1418                                          unsigned int sharedMemBytes,
1419                                          CUstream hStream, void **kernelParams, void **extra);
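/*
 * Usage sketch (illustrative; the module file name, kernel name, d_data, and
 * n are placeholders): loading a module, looking up a kernel, and launching
 * it via the kernelParams mechanism of cuLaunchKernel. Error checking omitted.
 *
 *     CUmodule   mod  = NULL;
 *     CUfunction kern = NULL;
 *     cuModuleLoad(&mod, "kernels.ptx");
 *     cuModuleGetFunction(&kern, mod, "scale");
 *     void *args[] = { &d_data, &n };          // addresses of the kernel arguments
 *     cuLaunchKernel(kern,
 *                    (n + 255) / 256, 1, 1,    // grid dimensions
 *                    256, 1, 1,                // block dimensions
 *                    0, NULL,                  // shared memory bytes, default stream
 *                    args, NULL);
 *     cuModuleUnload(mod);
 */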
1420 
1421 /************************************
1422  **
1423  **    Array management
1424  **
1425  ***********************************/
1426 
1427 typedef CUresult  CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
1428 typedef CUresult  CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
1429 typedef CUresult  CUDAAPI tcuArrayDestroy(CUarray hArray);
1430 
1431 typedef CUresult  CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
1432 typedef CUresult  CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
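/*
 * Usage sketch (illustrative; width and height are placeholders): creating a
 * 2D CUDA array of single-channel float texels. The fields follow the
 * CUDA_ARRAY_DESCRIPTOR layout consumed by cuArrayCreate.
 *
 *     CUDA_ARRAY_DESCRIPTOR desc;
 *     desc.Width       = width;
 *     desc.Height      = height;
 *     desc.Format      = CU_AD_FORMAT_FLOAT;
 *     desc.NumChannels = 1;
 *     CUarray arr = NULL;
 *     cuArrayCreate(&arr, &desc);
 *     // ... fill via cuMemcpyHtoA, bind to a texture reference ...
 *     cuArrayDestroy(arr);
 */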
1433 
1434 
1435 /************************************
1436  **
1437  **    Texture reference management
1438  **
1439  ***********************************/
1440 typedef CUresult  CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
1441 typedef CUresult  CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
1442 
1443 typedef CUresult  CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
1444 
1445 #if __CUDA_API_VERSION >= 3020
1446     typedef CUresult  CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
1447     typedef CUresult  CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
1448 #else
1449     typedef CUresult  CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
1450     typedef CUresult  CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
1451 #endif
1452 
1453 typedef CUresult  CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
1454 typedef CUresult  CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
1455 typedef CUresult  CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
1456 typedef CUresult  CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
1457 
1458 typedef CUresult  CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
1459 typedef CUresult  CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
1460 typedef CUresult  CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
1461 typedef CUresult  CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
1462 typedef CUresult  CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
1463 typedef CUresult  CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
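/*
 * Usage sketch (illustrative): binding a CUDA array to a texture reference
 * retrieved from a module. "mod" and "arr" are assumed to come from the
 * earlier sketches, and "tex" is a placeholder for the texture reference name
 * declared in the module; the flag and mode constants are the driver API's.
 *
 *     CUtexref texref = NULL;
 *     cuModuleGetTexRef(&texref, mod, "tex");
 *     cuTexRefSetArray(texref, arr, CU_TRSA_OVERRIDE_FORMAT);
 *     cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
 *     cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
 *     cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
 *     cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
 */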
1464 
1465 /************************************
1466  **
1467  **    Surface reference management
1468  **
1469  ***********************************/
1470 typedef CUresult  CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
1471 typedef CUresult  CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
1472 
1473 /************************************
1474  **
1475  **    Parameter management
1476  **
1477  ***********************************/
1478 
1479 typedef CUresult  CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
1480 typedef CUresult  CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
1481 typedef CUresult  CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
1482 typedef CUresult  CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
1483 typedef CUresult  CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
1484 
1485 
1486 /************************************
1487  **
1488  **    Launch functions
1489  **
1490  ***********************************/
1491 
1492 typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
1493 typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
1494 typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
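/*
 * Usage sketch (illustrative; kern, d_data, and n are placeholders): the
 * legacy launch path, in which kernel arguments are marshalled explicitly
 * with the parameter-management functions and the block shape is set on the
 * function before a grid is launched. cuLaunchKernel above supersedes this
 * sequence; the offsets shown ignore the alignment padding a production
 * marshaller would apply.
 *
 *     int offset = 0;
 *     cuFuncSetBlockShape(kern, 256, 1, 1);
 *     cuParamSetv(kern, offset, &d_data, sizeof(d_data));  offset += sizeof(d_data);
 *     cuParamSeti(kern, offset, n);                        offset += sizeof(int);
 *     cuParamSetSize(kern, offset);
 *     cuLaunchGrid(kern, (n + 255) / 256, 1);
 */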
1495 
1496 /************************************
1497  **
1498  **    Events
1499  **
1500  ***********************************/
1501 typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
1502 typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
1503 typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
1504 typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
1505 typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
1506 typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
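/*
 * Usage sketch (illustrative; "stream" is a placeholder): timing a stretch of
 * GPU work with a pair of events. Error checking omitted.
 *
 *     CUevent start = NULL, stop = NULL;
 *     float ms = 0.0f;
 *     cuEventCreate(&start, CU_EVENT_DEFAULT);
 *     cuEventCreate(&stop,  CU_EVENT_DEFAULT);
 *     cuEventRecord(start, stream);
 *     // ... launches and copies issued into the same stream ...
 *     cuEventRecord(stop, stream);
 *     cuEventSynchronize(stop);
 *     cuEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
 *     cuEventDestroy(start);
 *     cuEventDestroy(stop);
 */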
1507 
1508 /************************************
1509  **
1510  **    Streams
1511  **
1512  ***********************************/
1513 typedef CUresult CUDAAPI  tcuStreamCreate(CUstream *phStream, unsigned int Flags);
1514 typedef CUresult CUDAAPI  tcuStreamQuery(CUstream hStream);
1515 typedef CUresult CUDAAPI  tcuStreamSynchronize(CUstream hStream);
1516 typedef CUresult CUDAAPI  tcuStreamDestroy(CUstream hStream);
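/*
 * Usage sketch (illustrative): polling a stream for completion instead of
 * blocking in cuStreamSynchronize. CUDA_ERROR_NOT_READY is the driver API's
 * "work still pending" status, assumed to be declared earlier in this header.
 *
 *     while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY) {
 *         // do other host-side work
 *     }
 */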
1517 
1518 /************************************
1519  **
1520  **    Graphics interop
1521  **
1522  ***********************************/
1523 typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
1524 typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
1525 
1526 #if __CUDA_API_VERSION >= 3020
1527     typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
1528 #else
1529     typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
1530 #endif
1531 
1532 typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
1533 typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
1534 typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
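/*
 * Usage sketch (illustrative; "resource" is a placeholder CUgraphicsResource):
 * mapping a previously registered graphics resource to obtain a device
 * pointer. Registration itself goes through the API-specific interop entry
 * points (GL or D3D), which are not declared in this file.
 *
 *     CUdeviceptr dptr = 0;
 *     size_t size = 0;
 *     cuGraphicsMapResources(1, &resource, 0);                     // map on the default stream
 *     cuGraphicsResourceGetMappedPointer(&dptr, &size, resource);  // valid only while mapped
 *     // ... read or write dptr from kernels ...
 *     cuGraphicsUnmapResources(1, &resource, 0);
 *     cuGraphicsUnregisterResource(resource);                      // when no longer needed
 */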
1535 
1536 /************************************
1537  **
1538  **    Export tables
1539  **
1540  ***********************************/
1541 typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
1542 
1543 /************************************
1544  **
1545  **    Limits
1546  **
1547  ***********************************/
1548 
1549 typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
1550 typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
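/*
 * Usage sketch (illustrative; the byte count is arbitrary): adjusting and
 * reading back a per-context limit. CU_LIMIT_STACK_SIZE is one of the
 * driver API's CUlimit values.
 *
 *     size_t stackSize = 0;
 *     cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 4096);
 *     cuCtxGetLimit(&stackSize, CU_LIMIT_STACK_SIZE);
 */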
1551 
1552 
1553 extern tcuDriverGetVersion             *cuDriverGetVersion;
1554 extern tcuDeviceGet                    *cuDeviceGet;
1555 extern tcuDeviceGetCount               *cuDeviceGetCount;
1556 extern tcuDeviceGetName                *cuDeviceGetName;
1557 extern tcuDeviceComputeCapability      *cuDeviceComputeCapability;
1558 extern tcuDeviceGetProperties          *cuDeviceGetProperties;
1559 extern tcuDeviceGetAttribute           *cuDeviceGetAttribute;
1560 extern tcuCtxDestroy                   *cuCtxDestroy;
1561 extern tcuCtxAttach                    *cuCtxAttach;
1562 extern tcuCtxDetach                    *cuCtxDetach;
1563 extern tcuCtxPushCurrent               *cuCtxPushCurrent;
1564 extern tcuCtxPopCurrent                *cuCtxPopCurrent;
1565 
1566 extern tcuCtxSetCurrent                *cuCtxSetCurrent;
1567 extern tcuCtxGetCurrent                *cuCtxGetCurrent;
1568 
1569 extern tcuCtxGetDevice                 *cuCtxGetDevice;
1570 extern tcuCtxSynchronize               *cuCtxSynchronize;
1571 extern tcuModuleLoad                   *cuModuleLoad;
1572 extern tcuModuleLoadData               *cuModuleLoadData;
1573 extern tcuModuleLoadDataEx             *cuModuleLoadDataEx;
1574 extern tcuModuleLoadFatBinary          *cuModuleLoadFatBinary;
1575 extern tcuModuleUnload                 *cuModuleUnload;
1576 extern tcuModuleGetFunction            *cuModuleGetFunction;
1577 extern tcuModuleGetTexRef              *cuModuleGetTexRef;
1578 extern tcuModuleGetSurfRef             *cuModuleGetSurfRef;
1579 extern tcuMemFreeHost                  *cuMemFreeHost;
1580 extern tcuMemHostAlloc                 *cuMemHostAlloc;
1581 extern tcuMemHostGetFlags              *cuMemHostGetFlags;
1582 
1583 extern tcuMemHostRegister              *cuMemHostRegister;
1584 extern tcuMemHostUnregister            *cuMemHostUnregister;
1585 extern tcuMemcpy                       *cuMemcpy;
1586 extern tcuMemcpyPeer                   *cuMemcpyPeer;
1587 
1588 extern tcuDeviceTotalMem               *cuDeviceTotalMem;
1589 extern tcuCtxCreate                    *cuCtxCreate;
1590 extern tcuModuleGetGlobal              *cuModuleGetGlobal;
1591 extern tcuMemGetInfo                   *cuMemGetInfo;
1592 extern tcuMemAlloc                     *cuMemAlloc;
1593 extern tcuMemAllocPitch                *cuMemAllocPitch;
1594 extern tcuMemFree                      *cuMemFree;
1595 extern tcuMemGetAddressRange           *cuMemGetAddressRange;
1596 extern tcuMemAllocHost                 *cuMemAllocHost;
1597 extern tcuMemHostGetDevicePointer      *cuMemHostGetDevicePointer;
1598 extern tcuFuncSetBlockShape            *cuFuncSetBlockShape;
1599 extern tcuFuncSetSharedSize            *cuFuncSetSharedSize;
1600 extern tcuFuncGetAttribute             *cuFuncGetAttribute;
1601 extern tcuFuncSetCacheConfig           *cuFuncSetCacheConfig;
1602 extern tcuLaunchKernel                 *cuLaunchKernel;
1603 extern tcuArrayDestroy                 *cuArrayDestroy;
1604 extern tcuTexRefCreate                 *cuTexRefCreate;
1605 extern tcuTexRefDestroy                *cuTexRefDestroy;
1606 extern tcuTexRefSetArray               *cuTexRefSetArray;
1607 extern tcuTexRefSetFormat              *cuTexRefSetFormat;
1608 extern tcuTexRefSetAddressMode         *cuTexRefSetAddressMode;
1609 extern tcuTexRefSetFilterMode          *cuTexRefSetFilterMode;
1610 extern tcuTexRefSetFlags               *cuTexRefSetFlags;
1611 extern tcuTexRefGetArray               *cuTexRefGetArray;
1612 extern tcuTexRefGetAddressMode         *cuTexRefGetAddressMode;
1613 extern tcuTexRefGetFilterMode          *cuTexRefGetFilterMode;
1614 extern tcuTexRefGetFormat              *cuTexRefGetFormat;
1615 extern tcuTexRefGetFlags               *cuTexRefGetFlags;
1616 extern tcuSurfRefSetArray              *cuSurfRefSetArray;
1617 extern tcuSurfRefGetArray              *cuSurfRefGetArray;
1618 extern tcuParamSetSize                 *cuParamSetSize;
1619 extern tcuParamSeti                    *cuParamSeti;
1620 extern tcuParamSetf                    *cuParamSetf;
1621 extern tcuParamSetv                    *cuParamSetv;
1622 extern tcuParamSetTexRef               *cuParamSetTexRef;
1623 extern tcuLaunch                       *cuLaunch;
1624 extern tcuLaunchGrid                   *cuLaunchGrid;
1625 extern tcuLaunchGridAsync              *cuLaunchGridAsync;
1626 extern tcuEventCreate                  *cuEventCreate;
1627 extern tcuEventRecord                  *cuEventRecord;
1628 extern tcuEventQuery                   *cuEventQuery;
1629 extern tcuEventSynchronize             *cuEventSynchronize;
1630 extern tcuEventDestroy                 *cuEventDestroy;
1631 extern tcuEventElapsedTime             *cuEventElapsedTime;
1632 extern tcuStreamCreate                 *cuStreamCreate;
1633 extern tcuStreamQuery                  *cuStreamQuery;
1634 extern tcuStreamSynchronize            *cuStreamSynchronize;
1635 extern tcuStreamDestroy                *cuStreamDestroy;
1636 extern tcuGraphicsUnregisterResource   *cuGraphicsUnregisterResource;
1637 extern tcuGraphicsSubResourceGetMappedArray  *cuGraphicsSubResourceGetMappedArray;
1638 extern tcuGraphicsResourceSetMapFlags  *cuGraphicsResourceSetMapFlags;
1639 extern tcuGraphicsMapResources         *cuGraphicsMapResources;
1640 extern tcuGraphicsUnmapResources       *cuGraphicsUnmapResources;
1641 extern tcuGetExportTable               *cuGetExportTable;
1642 extern tcuCtxSetLimit                  *cuCtxSetLimit;
1643 extern tcuCtxGetLimit                  *cuCtxGetLimit;
1644 
// These entry points may resolve to the CUDA 3.2 interface (the _v2 variants), depending on the installed driver
1646 extern tcuMemcpyHtoD                   *cuMemcpyHtoD;
1647 extern tcuMemcpyDtoH                   *cuMemcpyDtoH;
1648 extern tcuMemcpyDtoD                   *cuMemcpyDtoD;
1649 extern tcuMemcpyDtoA                   *cuMemcpyDtoA;
1650 extern tcuMemcpyAtoD                   *cuMemcpyAtoD;
1651 extern tcuMemcpyHtoA                   *cuMemcpyHtoA;
1652 extern tcuMemcpyAtoH                   *cuMemcpyAtoH;
1653 extern tcuMemcpyAtoA                   *cuMemcpyAtoA;
1654 extern tcuMemcpy2D                     *cuMemcpy2D;
1655 extern tcuMemcpy2DUnaligned            *cuMemcpy2DUnaligned;
1656 extern tcuMemcpy3D                     *cuMemcpy3D;
1657 extern tcuMemcpyHtoDAsync              *cuMemcpyHtoDAsync;
1658 extern tcuMemcpyDtoHAsync              *cuMemcpyDtoHAsync;
1659 extern tcuMemcpyDtoDAsync              *cuMemcpyDtoDAsync;
1660 extern tcuMemcpyHtoAAsync              *cuMemcpyHtoAAsync;
1661 extern tcuMemcpyAtoHAsync              *cuMemcpyAtoHAsync;
1662 extern tcuMemcpy2DAsync                *cuMemcpy2DAsync;
1663 extern tcuMemcpy3DAsync                *cuMemcpy3DAsync;
1664 extern tcuMemsetD8                     *cuMemsetD8;
1665 extern tcuMemsetD16                    *cuMemsetD16;
1666 extern tcuMemsetD32                    *cuMemsetD32;
1667 extern tcuMemsetD2D8                   *cuMemsetD2D8;
1668 extern tcuMemsetD2D16                  *cuMemsetD2D16;
1669 extern tcuMemsetD2D32                  *cuMemsetD2D32;
1670 extern tcuArrayCreate                  *cuArrayCreate;
1671 extern tcuArrayGetDescriptor           *cuArrayGetDescriptor;
1672 extern tcuArray3DCreate                *cuArray3DCreate;
1673 extern tcuArray3DGetDescriptor         *cuArray3DGetDescriptor;
1674 extern tcuTexRefSetAddress             *cuTexRefSetAddress;
1675 extern tcuTexRefSetAddress2D           *cuTexRefSetAddress2D;
1676 extern tcuTexRefGetAddress             *cuTexRefGetAddress;
1677 extern tcuGraphicsResourceGetMappedPointer   *cuGraphicsResourceGetMappedPointer;
1678 
1679 /************************************/
CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion, void *hHandleDriver);
1681 /************************************/
1682 
1683 #ifdef __cplusplus
1684 }
1685 #endif
1686 
1687 #endif //__cuda_cuda_h__
1688