1 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
2 /* */
3 /* Part of the LLVM Project, under the Apache License v2.0 with LLVM */
4 /* Exceptions. */
5 /* See https://llvm.org/LICENSE.txt for license information. */
6 /* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */
7 /* */
8 /******************************************************************************/
9 /* */
10 /* This file implements GPUJIT, a ptx string execution engine for GPU. */
11 /* */
12 /******************************************************************************/
13
14 #include "GPUJIT.h"
15
16 #ifdef HAS_LIBCUDART
17 #include <cuda.h>
18 #include <cuda_runtime.h>
19 #endif /* HAS_LIBCUDART */
20
21 #ifdef HAS_LIBOPENCL
22 #ifdef __APPLE__
23 #include <OpenCL/opencl.h>
24 #else
25 #include <CL/cl.h>
26 #endif /* __APPLE__ */
27 #endif /* HAS_LIBOPENCL */
28
29 #include <assert.h>
30 #include <dlfcn.h>
31 #include <stdarg.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <unistd.h>
36
37 static int DebugMode;
38 static int CacheMode;
39 #define max(x, y) ((x) > (y) ? (x) : (y))
40
41 static PollyGPURuntime Runtime = RUNTIME_NONE;
42
debug_print(const char * format,...)43 static void debug_print(const char *format, ...) {
44 if (!DebugMode)
45 return;
46
47 va_list args;
48 va_start(args, format);
49 vfprintf(stderr, format, args);
50 va_end(args);
51 }
52 #define dump_function() debug_print("-> %s\n", __func__)
53
54 #define KERNEL_CACHE_SIZE 10
55
56 static void err_runtime() __attribute__((noreturn));
err_runtime()57 static void err_runtime() {
58 fprintf(stderr, "Runtime not correctly initialized.\n");
59 exit(-1);
60 }
61
62 struct PollyGPUContextT {
63 void *Context;
64 };
65
66 struct PollyGPUFunctionT {
67 void *Kernel;
68 };
69
70 struct PollyGPUDevicePtrT {
71 void *DevicePtr;
72 };
73
74 /******************************************************************************/
75 /* OpenCL */
76 /******************************************************************************/
77 #ifdef HAS_LIBOPENCL
78
79 struct OpenCLContextT {
80 cl_context Context;
81 cl_command_queue CommandQueue;
82 };
83
84 struct OpenCLKernelT {
85 cl_kernel Kernel;
86 cl_program Program;
87 const char *BinaryString;
88 };
89
90 struct OpenCLDevicePtrT {
91 cl_mem MemObj;
92 };
93
94 /* Dynamic library handles for the OpenCL runtime library. */
95 static void *HandleOpenCL;
96 static void *HandleOpenCLBeignet;
97
98 /* Type-defines of function pointer to OpenCL Runtime API. */
99 typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
100 cl_platform_id *Platforms,
101 cl_uint *NumPlatforms);
102 static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
103
104 typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
105 cl_device_type DeviceType,
106 cl_uint NumEntries, cl_device_id *Devices,
107 cl_uint *NumDevices);
108 static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
109
110 typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
111 cl_device_info ParamName,
112 size_t ParamValueSize, void *ParamValue,
113 size_t *ParamValueSizeRet);
114 static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
115
116 typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
117 size_t ParamValueSize, void *ParamValue,
118 size_t *ParamValueSizeRet);
119 static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
120
121 typedef cl_context clCreateContextFcnTy(
122 const cl_context_properties *Properties, cl_uint NumDevices,
123 const cl_device_id *Devices,
124 void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
125 size_t CB, void *UserData),
126 void *UserData, cl_int *ErrcodeRet);
127 static clCreateContextFcnTy *clCreateContextFcnPtr;
128
129 typedef cl_command_queue
130 clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
131 cl_command_queue_properties Properties,
132 cl_int *ErrcodeRet);
133 static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
134
135 typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
136 size_t Size, void *HostPtr,
137 cl_int *ErrcodeRet);
138 static clCreateBufferFcnTy *clCreateBufferFcnPtr;
139
140 typedef cl_int
141 clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
142 cl_bool BlockingWrite, size_t Offset, size_t Size,
143 const void *Ptr, cl_uint NumEventsInWaitList,
144 const cl_event *EventWaitList, cl_event *Event);
145 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
146
147 typedef cl_program
148 clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
149 const cl_device_id *DeviceList,
150 const char *Filename, cl_int *ErrcodeRet);
151 static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
152
153 typedef cl_program clCreateProgramWithBinaryFcnTy(
154 cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
155 const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
156 cl_int *ErrcodeRet);
157 static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
158
159 typedef cl_int clBuildProgramFcnTy(
160 cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
161 const char *Options,
162 void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
163 void *UserData);
164 static clBuildProgramFcnTy *clBuildProgramFcnPtr;
165
166 typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
167 const char *KernelName,
168 cl_int *ErrcodeRet);
169 static clCreateKernelFcnTy *clCreateKernelFcnPtr;
170
171 typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
172 size_t ArgSize, const void *ArgValue);
173 static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
174
175 typedef cl_int clEnqueueNDRangeKernelFcnTy(
176 cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
177 const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
178 const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
179 const cl_event *EventWaitList, cl_event *Event);
180 static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
181
182 typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
183 cl_mem Buffer, cl_bool BlockingRead,
184 size_t Offset, size_t Size, void *Ptr,
185 cl_uint NumEventsInWaitList,
186 const cl_event *EventWaitList,
187 cl_event *Event);
188 static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
189
190 typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
191 static clFlushFcnTy *clFlushFcnPtr;
192
193 typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
194 static clFinishFcnTy *clFinishFcnPtr;
195
196 typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
197 static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
198
199 typedef cl_int clReleaseProgramFcnTy(cl_program Program);
200 static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
201
202 typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
203 static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
204
205 typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
206 static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
207
208 typedef cl_int clReleaseContextFcnTy(cl_context Context);
209 static clReleaseContextFcnTy *clReleaseContextFcnPtr;
210
getAPIHandleCL(void * Handle,const char * FuncName)211 static void *getAPIHandleCL(void *Handle, const char *FuncName) {
212 char *Err;
213 void *FuncPtr;
214 dlerror();
215 FuncPtr = dlsym(Handle, FuncName);
216 if ((Err = dlerror()) != 0) {
217 fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
218 return 0;
219 }
220 return FuncPtr;
221 }
222
initialDeviceAPILibrariesCL()223 static int initialDeviceAPILibrariesCL() {
224 HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
225 HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
226 if (!HandleOpenCL) {
227 fprintf(stderr, "Cannot open library: %s. \n", dlerror());
228 return 0;
229 }
230 return 1;
231 }
232
233 /* Get function pointer to OpenCL Runtime API.
234 *
235 * Note that compilers conforming to the ISO C standard are required to
236 * generate a warning if a conversion from a void * pointer to a function
237 * pointer is attempted as in the following statements. The warning
238 * of this kind of cast may not be emitted by clang and new versions of gcc
239 * as it is valid on POSIX 2008. For compilers required to generate a warning,
240 * we temporarily disable -Wpedantic, to avoid bloating the output with
241 * unnecessary warnings.
242 *
243 * Reference:
244 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
245 */
246 #pragma GCC diagnostic push
247 #pragma GCC diagnostic ignored "-Wpedantic"
initialDeviceAPIsCL()248 static int initialDeviceAPIsCL() {
249 if (initialDeviceAPILibrariesCL() == 0)
250 return 0;
251
252 // FIXME: We are now always selecting the Intel Beignet driver if it is
253 // available on the system, instead of a possible NVIDIA or AMD OpenCL
254 // API. This selection should occurr based on the target architecture
255 // chosen when compiling.
256 void *Handle =
257 (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
258
259 clGetPlatformIDsFcnPtr =
260 (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
261
262 clGetDeviceIDsFcnPtr =
263 (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
264
265 clGetDeviceInfoFcnPtr =
266 (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
267
268 clGetKernelInfoFcnPtr =
269 (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
270
271 clCreateContextFcnPtr =
272 (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
273
274 clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
275 Handle, "clCreateCommandQueue");
276
277 clCreateBufferFcnPtr =
278 (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
279
280 clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
281 Handle, "clEnqueueWriteBuffer");
282
283 if (HandleOpenCLBeignet)
284 clCreateProgramWithLLVMIntelFcnPtr =
285 (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
286 Handle, "clCreateProgramWithLLVMIntel");
287
288 clCreateProgramWithBinaryFcnPtr =
289 (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
290 Handle, "clCreateProgramWithBinary");
291
292 clBuildProgramFcnPtr =
293 (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
294
295 clCreateKernelFcnPtr =
296 (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
297
298 clSetKernelArgFcnPtr =
299 (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
300
301 clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
302 Handle, "clEnqueueNDRangeKernel");
303
304 clEnqueueReadBufferFcnPtr =
305 (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
306
307 clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
308
309 clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
310
311 clReleaseKernelFcnPtr =
312 (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
313
314 clReleaseProgramFcnPtr =
315 (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
316
317 clReleaseMemObjectFcnPtr =
318 (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
319
320 clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
321 Handle, "clReleaseCommandQueue");
322
323 clReleaseContextFcnPtr =
324 (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
325
326 return 1;
327 }
328 #pragma GCC diagnostic pop
329
330 /* Context and Device. */
331 static PollyGPUContext *GlobalContext = NULL;
332 static cl_device_id GlobalDeviceID = NULL;
333
334 /* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
335 static void printOpenCLError(int Error);
336
checkOpenCLError(int Ret,const char * format,...)337 static void checkOpenCLError(int Ret, const char *format, ...) {
338 if (Ret == CL_SUCCESS)
339 return;
340
341 printOpenCLError(Ret);
342 va_list args;
343 va_start(args, format);
344 vfprintf(stderr, format, args);
345 va_end(args);
346 exit(-1);
347 }
348
initContextCL()349 static PollyGPUContext *initContextCL() {
350 dump_function();
351
352 PollyGPUContext *Context;
353
354 cl_platform_id PlatformID = NULL;
355 cl_device_id DeviceID = NULL;
356 cl_uint NumDevicesRet;
357 cl_int Ret;
358
359 char DeviceRevision[256];
360 char DeviceName[256];
361 size_t DeviceRevisionRetSize, DeviceNameRetSize;
362
363 static __thread PollyGPUContext *CurrentContext = NULL;
364
365 if (CurrentContext)
366 return CurrentContext;
367
368 /* Get API handles. */
369 if (initialDeviceAPIsCL() == 0) {
370 fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
371 exit(-1);
372 }
373
374 /* Get number of devices that support OpenCL. */
375 static const int NumberOfPlatforms = 1;
376 Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
377 checkOpenCLError(Ret, "Failed to get platform IDs.\n");
378 // TODO: Extend to CL_DEVICE_TYPE_ALL?
379 static const int NumberOfDevices = 1;
380 Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
381 &DeviceID, &NumDevicesRet);
382 checkOpenCLError(Ret, "Failed to get device IDs.\n");
383
384 GlobalDeviceID = DeviceID;
385 if (NumDevicesRet == 0) {
386 fprintf(stderr, "There is no device supporting OpenCL.\n");
387 exit(-1);
388 }
389
390 /* Get device revision. */
391 Ret =
392 clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
393 DeviceRevision, &DeviceRevisionRetSize);
394 checkOpenCLError(Ret, "Failed to fetch device revision.\n");
395
396 /* Get device name. */
397 Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
398 DeviceName, &DeviceNameRetSize);
399 checkOpenCLError(Ret, "Failed to fetch device name.\n");
400
401 debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
402
403 /* Create context on the device. */
404 Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
405 if (Context == 0) {
406 fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
407 exit(-1);
408 }
409 Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
410 if (Context->Context == 0) {
411 fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
412 exit(-1);
413 }
414 ((OpenCLContext *)Context->Context)->Context =
415 clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
416 checkOpenCLError(Ret, "Failed to create context.\n");
417
418 static const int ExtraProperties = 0;
419 ((OpenCLContext *)Context->Context)->CommandQueue =
420 clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
421 DeviceID, ExtraProperties, &Ret);
422 checkOpenCLError(Ret, "Failed to create command queue.\n");
423
424 if (CacheMode)
425 CurrentContext = Context;
426
427 GlobalContext = Context;
428 return Context;
429 }
430
freeKernelCL(PollyGPUFunction * Kernel)431 static void freeKernelCL(PollyGPUFunction *Kernel) {
432 dump_function();
433
434 if (CacheMode)
435 return;
436
437 if (!GlobalContext) {
438 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
439 exit(-1);
440 }
441
442 cl_int Ret;
443 Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
444 checkOpenCLError(Ret, "Failed to flush command queue.\n");
445 Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
446 checkOpenCLError(Ret, "Failed to finish command queue.\n");
447
448 if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
449 cl_int Ret =
450 clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
451 checkOpenCLError(Ret, "Failed to release kernel.\n");
452 }
453
454 if (((OpenCLKernel *)Kernel->Kernel)->Program) {
455 cl_int Ret =
456 clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
457 checkOpenCLError(Ret, "Failed to release program.\n");
458 }
459
460 if (Kernel->Kernel)
461 free((OpenCLKernel *)Kernel->Kernel);
462
463 if (Kernel)
464 free(Kernel);
465 }
466
getKernelCL(const char * BinaryBuffer,const char * KernelName)467 static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
468 const char *KernelName) {
469 dump_function();
470
471 if (!GlobalContext) {
472 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
473 exit(-1);
474 }
475
476 static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
477 static __thread int NextCacheItem = 0;
478
479 for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
480 // We exploit here the property that all Polly-ACC kernels are allocated
481 // as global constants, hence a pointer comparision is sufficient to
482 // determin equality.
483 if (KernelCache[i] &&
484 ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
485 BinaryBuffer) {
486 debug_print(" -> using cached kernel\n");
487 return KernelCache[i];
488 }
489 }
490
491 PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
492 if (Function == 0) {
493 fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
494 exit(-1);
495 }
496 Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
497 if (Function->Kernel == 0) {
498 fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
499 exit(-1);
500 }
501
502 if (!GlobalDeviceID) {
503 fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
504 exit(-1);
505 }
506
507 cl_int Ret;
508
509 if (HandleOpenCLBeignet) {
510 // This is a workaround, since clCreateProgramWithLLVMIntel only
511 // accepts a filename to a valid llvm-ir file as an argument, instead
512 // of accepting the BinaryBuffer directly.
513 char FileName[] = "/tmp/polly_kernelXXXXXX";
514 int File = mkstemp(FileName);
515 write(File, BinaryBuffer, strlen(BinaryBuffer));
516
517 ((OpenCLKernel *)Function->Kernel)->Program =
518 clCreateProgramWithLLVMIntelFcnPtr(
519 ((OpenCLContext *)GlobalContext->Context)->Context, 1,
520 &GlobalDeviceID, FileName, &Ret);
521 checkOpenCLError(Ret, "Failed to create program from llvm.\n");
522 close(File);
523 unlink(FileName);
524 } else {
525 size_t BinarySize = strlen(BinaryBuffer);
526 ((OpenCLKernel *)Function->Kernel)->Program =
527 clCreateProgramWithBinaryFcnPtr(
528 ((OpenCLContext *)GlobalContext->Context)->Context, 1,
529 &GlobalDeviceID, (const size_t *)&BinarySize,
530 (const unsigned char **)&BinaryBuffer, NULL, &Ret);
531 checkOpenCLError(Ret, "Failed to create program from binary.\n");
532 }
533
534 Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
535 &GlobalDeviceID, NULL, NULL, NULL);
536 checkOpenCLError(Ret, "Failed to build program.\n");
537
538 ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
539 ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
540 checkOpenCLError(Ret, "Failed to create kernel.\n");
541
542 ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
543
544 if (CacheMode) {
545 if (KernelCache[NextCacheItem])
546 freeKernelCL(KernelCache[NextCacheItem]);
547
548 KernelCache[NextCacheItem] = Function;
549
550 NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
551 }
552
553 return Function;
554 }
555
copyFromHostToDeviceCL(void * HostData,PollyGPUDevicePtr * DevData,long MemSize)556 static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
557 long MemSize) {
558 dump_function();
559
560 if (!GlobalContext) {
561 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
562 exit(-1);
563 }
564
565 cl_int Ret;
566 Ret = clEnqueueWriteBufferFcnPtr(
567 ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
568 ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
569 HostData, 0, NULL, NULL);
570 checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
571 }
572
copyFromDeviceToHostCL(PollyGPUDevicePtr * DevData,void * HostData,long MemSize)573 static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
574 long MemSize) {
575 dump_function();
576
577 if (!GlobalContext) {
578 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
579 exit(-1);
580 }
581
582 cl_int Ret;
583 Ret = clEnqueueReadBufferFcnPtr(
584 ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
585 ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
586 HostData, 0, NULL, NULL);
587 checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
588 }
589
launchKernelCL(PollyGPUFunction * Kernel,unsigned int GridDimX,unsigned int GridDimY,unsigned int BlockDimX,unsigned int BlockDimY,unsigned int BlockDimZ,void ** Parameters)590 static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
591 unsigned int GridDimY, unsigned int BlockDimX,
592 unsigned int BlockDimY, unsigned int BlockDimZ,
593 void **Parameters) {
594 dump_function();
595
596 cl_int Ret;
597 cl_uint NumArgs;
598
599 if (!GlobalContext) {
600 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
601 exit(-1);
602 }
603
604 OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
605 Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
606 sizeof(cl_uint), &NumArgs, NULL);
607 checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
608
609 /* Argument sizes are stored at the end of the Parameters array. */
610 for (cl_uint i = 0; i < NumArgs; i++) {
611 Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
612 *((int *)Parameters[NumArgs + i]),
613 (void *)Parameters[i]);
614 checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
615 }
616
617 unsigned int GridDimZ = 1;
618 size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
619 BlockDimZ * GridDimZ};
620 size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
621
622 static const int WorkDim = 3;
623 OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
624 Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
625 WorkDim, NULL, GlobalWorkSize,
626 LocalWorkSize, 0, NULL, NULL);
627 checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
628 }
629
freeDeviceMemoryCL(PollyGPUDevicePtr * Allocation)630 static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
631 dump_function();
632
633 OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
634 cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
635 checkOpenCLError(Ret, "Failed to free device memory.\n");
636
637 free(DevPtr);
638 free(Allocation);
639 }
640
allocateMemoryForDeviceCL(long MemSize)641 static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
642 dump_function();
643
644 if (!GlobalContext) {
645 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
646 exit(-1);
647 }
648
649 PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
650 if (DevData == 0) {
651 fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
652 exit(-1);
653 }
654 DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
655 if (DevData->DevicePtr == 0) {
656 fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
657 exit(-1);
658 }
659
660 cl_int Ret;
661 ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
662 clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
663 CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
664 checkOpenCLError(Ret,
665 "Allocate memory for GPU device memory pointer failed.\n");
666
667 return DevData;
668 }
669
getDevicePtrCL(PollyGPUDevicePtr * Allocation)670 static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
671 dump_function();
672
673 OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
674 return (void *)DevPtr->MemObj;
675 }
676
synchronizeDeviceCL()677 static void synchronizeDeviceCL() {
678 dump_function();
679
680 if (!GlobalContext) {
681 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
682 exit(-1);
683 }
684
685 if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
686 CL_SUCCESS) {
687 fprintf(stderr, "Synchronizing device and host memory failed.\n");
688 exit(-1);
689 }
690 }
691
freeContextCL(PollyGPUContext * Context)692 static void freeContextCL(PollyGPUContext *Context) {
693 dump_function();
694
695 cl_int Ret;
696
697 GlobalContext = NULL;
698
699 OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
700 if (Ctx->CommandQueue) {
701 Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
702 checkOpenCLError(Ret, "Could not release command queue.\n");
703 }
704
705 if (Ctx->Context) {
706 Ret = clReleaseContextFcnPtr(Ctx->Context);
707 checkOpenCLError(Ret, "Could not release context.\n");
708 }
709
710 free(Ctx);
711 free(Context);
712 }
713
printOpenCLError(int Error)714 static void printOpenCLError(int Error) {
715
716 switch (Error) {
717 case CL_SUCCESS:
718 // Success, don't print an error.
719 break;
720
721 // JIT/Runtime errors.
722 case CL_DEVICE_NOT_FOUND:
723 fprintf(stderr, "Device not found.\n");
724 break;
725 case CL_DEVICE_NOT_AVAILABLE:
726 fprintf(stderr, "Device not available.\n");
727 break;
728 case CL_COMPILER_NOT_AVAILABLE:
729 fprintf(stderr, "Compiler not available.\n");
730 break;
731 case CL_MEM_OBJECT_ALLOCATION_FAILURE:
732 fprintf(stderr, "Mem object allocation failure.\n");
733 break;
734 case CL_OUT_OF_RESOURCES:
735 fprintf(stderr, "Out of resources.\n");
736 break;
737 case CL_OUT_OF_HOST_MEMORY:
738 fprintf(stderr, "Out of host memory.\n");
739 break;
740 case CL_PROFILING_INFO_NOT_AVAILABLE:
741 fprintf(stderr, "Profiling info not available.\n");
742 break;
743 case CL_MEM_COPY_OVERLAP:
744 fprintf(stderr, "Mem copy overlap.\n");
745 break;
746 case CL_IMAGE_FORMAT_MISMATCH:
747 fprintf(stderr, "Image format mismatch.\n");
748 break;
749 case CL_IMAGE_FORMAT_NOT_SUPPORTED:
750 fprintf(stderr, "Image format not supported.\n");
751 break;
752 case CL_BUILD_PROGRAM_FAILURE:
753 fprintf(stderr, "Build program failure.\n");
754 break;
755 case CL_MAP_FAILURE:
756 fprintf(stderr, "Map failure.\n");
757 break;
758 case CL_MISALIGNED_SUB_BUFFER_OFFSET:
759 fprintf(stderr, "Misaligned sub buffer offset.\n");
760 break;
761 case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
762 fprintf(stderr, "Exec status error for events in wait list.\n");
763 break;
764 case CL_COMPILE_PROGRAM_FAILURE:
765 fprintf(stderr, "Compile program failure.\n");
766 break;
767 case CL_LINKER_NOT_AVAILABLE:
768 fprintf(stderr, "Linker not available.\n");
769 break;
770 case CL_LINK_PROGRAM_FAILURE:
771 fprintf(stderr, "Link program failure.\n");
772 break;
773 case CL_DEVICE_PARTITION_FAILED:
774 fprintf(stderr, "Device partition failed.\n");
775 break;
776 case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
777 fprintf(stderr, "Kernel arg info not available.\n");
778 break;
779
780 // Compiler errors.
781 case CL_INVALID_VALUE:
782 fprintf(stderr, "Invalid value.\n");
783 break;
784 case CL_INVALID_DEVICE_TYPE:
785 fprintf(stderr, "Invalid device type.\n");
786 break;
787 case CL_INVALID_PLATFORM:
788 fprintf(stderr, "Invalid platform.\n");
789 break;
790 case CL_INVALID_DEVICE:
791 fprintf(stderr, "Invalid device.\n");
792 break;
793 case CL_INVALID_CONTEXT:
794 fprintf(stderr, "Invalid context.\n");
795 break;
796 case CL_INVALID_QUEUE_PROPERTIES:
797 fprintf(stderr, "Invalid queue properties.\n");
798 break;
799 case CL_INVALID_COMMAND_QUEUE:
800 fprintf(stderr, "Invalid command queue.\n");
801 break;
802 case CL_INVALID_HOST_PTR:
803 fprintf(stderr, "Invalid host pointer.\n");
804 break;
805 case CL_INVALID_MEM_OBJECT:
806 fprintf(stderr, "Invalid memory object.\n");
807 break;
808 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
809 fprintf(stderr, "Invalid image format descriptor.\n");
810 break;
811 case CL_INVALID_IMAGE_SIZE:
812 fprintf(stderr, "Invalid image size.\n");
813 break;
814 case CL_INVALID_SAMPLER:
815 fprintf(stderr, "Invalid sampler.\n");
816 break;
817 case CL_INVALID_BINARY:
818 fprintf(stderr, "Invalid binary.\n");
819 break;
820 case CL_INVALID_BUILD_OPTIONS:
821 fprintf(stderr, "Invalid build options.\n");
822 break;
823 case CL_INVALID_PROGRAM:
824 fprintf(stderr, "Invalid program.\n");
825 break;
826 case CL_INVALID_PROGRAM_EXECUTABLE:
827 fprintf(stderr, "Invalid program executable.\n");
828 break;
829 case CL_INVALID_KERNEL_NAME:
830 fprintf(stderr, "Invalid kernel name.\n");
831 break;
832 case CL_INVALID_KERNEL_DEFINITION:
833 fprintf(stderr, "Invalid kernel definition.\n");
834 break;
835 case CL_INVALID_KERNEL:
836 fprintf(stderr, "Invalid kernel.\n");
837 break;
838 case CL_INVALID_ARG_INDEX:
839 fprintf(stderr, "Invalid arg index.\n");
840 break;
841 case CL_INVALID_ARG_VALUE:
842 fprintf(stderr, "Invalid arg value.\n");
843 break;
844 case CL_INVALID_ARG_SIZE:
845 fprintf(stderr, "Invalid arg size.\n");
846 break;
847 case CL_INVALID_KERNEL_ARGS:
848 fprintf(stderr, "Invalid kernel args.\n");
849 break;
850 case CL_INVALID_WORK_DIMENSION:
851 fprintf(stderr, "Invalid work dimension.\n");
852 break;
853 case CL_INVALID_WORK_GROUP_SIZE:
854 fprintf(stderr, "Invalid work group size.\n");
855 break;
856 case CL_INVALID_WORK_ITEM_SIZE:
857 fprintf(stderr, "Invalid work item size.\n");
858 break;
859 case CL_INVALID_GLOBAL_OFFSET:
860 fprintf(stderr, "Invalid global offset.\n");
861 break;
862 case CL_INVALID_EVENT_WAIT_LIST:
863 fprintf(stderr, "Invalid event wait list.\n");
864 break;
865 case CL_INVALID_EVENT:
866 fprintf(stderr, "Invalid event.\n");
867 break;
868 case CL_INVALID_OPERATION:
869 fprintf(stderr, "Invalid operation.\n");
870 break;
871 case CL_INVALID_GL_OBJECT:
872 fprintf(stderr, "Invalid GL object.\n");
873 break;
874 case CL_INVALID_BUFFER_SIZE:
875 fprintf(stderr, "Invalid buffer size.\n");
876 break;
877 case CL_INVALID_MIP_LEVEL:
878 fprintf(stderr, "Invalid mip level.\n");
879 break;
880 case CL_INVALID_GLOBAL_WORK_SIZE:
881 fprintf(stderr, "Invalid global work size.\n");
882 break;
883 case CL_INVALID_PROPERTY:
884 fprintf(stderr, "Invalid property.\n");
885 break;
886 case CL_INVALID_IMAGE_DESCRIPTOR:
887 fprintf(stderr, "Invalid image descriptor.\n");
888 break;
889 case CL_INVALID_COMPILER_OPTIONS:
890 fprintf(stderr, "Invalid compiler options.\n");
891 break;
892 case CL_INVALID_LINKER_OPTIONS:
893 fprintf(stderr, "Invalid linker options.\n");
894 break;
895 case CL_INVALID_DEVICE_PARTITION_COUNT:
896 fprintf(stderr, "Invalid device partition count.\n");
897 break;
898 case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
899 fprintf(stderr, "Invalid pipe size.\n");
900 break;
901 case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
902 fprintf(stderr, "Invalid device queue.\n");
903 break;
904
905 // NVIDIA specific error.
906 case -9999:
907 fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
908 break;
909
910 default:
911 fprintf(stderr, "Unknown error code!\n");
912 break;
913 }
914 }
915
916 #endif /* HAS_LIBOPENCL */
917 /******************************************************************************/
918 /* CUDA */
919 /******************************************************************************/
920 #ifdef HAS_LIBCUDART
921
922 struct CUDAContextT {
923 CUcontext Cuda;
924 };
925
926 struct CUDAKernelT {
927 CUfunction Cuda;
928 CUmodule CudaModule;
929 const char *BinaryString;
930 };
931
932 struct CUDADevicePtrT {
933 CUdeviceptr Cuda;
934 };
935
936 /* Dynamic library handles for the CUDA and CUDA runtime library. */
937 static void *HandleCuda;
938 static void *HandleCudaRT;
939
940 /* Type-defines of function pointer to CUDA driver APIs. */
941 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
942 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
943
944 typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t,
945 unsigned int);
946 static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr;
947
948 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
949 CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
950 unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
951 unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
952 void **KernelParams, void **Extra);
953 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
954
955 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
956 static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
957
958 typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
959 static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;
960
961 typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
962 static CuMemFreeFcnTy *CuMemFreeFcnPtr;
963
964 typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
965 static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;
966
967 typedef CUresult CUDAAPI CuProfilerStopFcnTy();
968 static CuProfilerStopFcnTy *CuProfilerStopFcnPtr;
969
970 typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
971 static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;
972
973 typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
974 static CuInitFcnTy *CuInitFcnPtr;
975
976 typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
977 static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;
978
979 typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
980 static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;
981
982 typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *);
983 static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr;
984
985 typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
986 static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;
987
988 typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
989 unsigned int, CUjit_option *,
990 void **);
991 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
992
993 typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
994 const void *Image);
995 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
996
997 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
998 const char *);
999 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
1000
1001 typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
1002 static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
1003
1004 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
1005 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
1006
1007 typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
1008 CUjitInputType Type, void *Data,
1009 size_t Size, const char *Name,
1010 unsigned int NumOptions,
1011 CUjit_option *Options,
1012 void **OptionValues);
1013 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
1014
1015 typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
1016 CUjit_option *Options,
1017 void **OptionValues,
1018 CUlinkState *StateOut);
1019 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
1020
1021 typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
1022 size_t *SizeOut);
1023 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
1024
1025 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
1026 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
1027
1028 typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
1029 static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
1030
1031 /* Type-defines of function pointer ot CUDA runtime APIs. */
1032 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
1033 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
1034
getAPIHandleCUDA(void * Handle,const char * FuncName)1035 static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
1036 char *Err;
1037 void *FuncPtr;
1038 dlerror();
1039 FuncPtr = dlsym(Handle, FuncName);
1040 if ((Err = dlerror()) != 0) {
1041 fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
1042 return 0;
1043 }
1044 return FuncPtr;
1045 }
1046
initialDeviceAPILibrariesCUDA()1047 static int initialDeviceAPILibrariesCUDA() {
1048 HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
1049 if (!HandleCuda) {
1050 fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1051 return 0;
1052 }
1053
1054 HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
1055 if (!HandleCudaRT) {
1056 fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1057 return 0;
1058 }
1059
1060 return 1;
1061 }
1062
1063 /* Get function pointer to CUDA Driver APIs.
1064 *
1065 * Note that compilers conforming to the ISO C standard are required to
1066 * generate a warning if a conversion from a void * pointer to a function
1067 * pointer is attempted as in the following statements. The warning
1068 * of this kind of cast may not be emitted by clang and new versions of gcc
1069 * as it is valid on POSIX 2008. For compilers required to generate a warning,
1070 * we temporarily disable -Wpedantic, to avoid bloating the output with
1071 * unnecessary warnings.
1072 *
1073 * Reference:
1074 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
1075 */
1076 #pragma GCC diagnostic push
1077 #pragma GCC diagnostic ignored "-Wpedantic"
initialDeviceAPIsCUDA()1078 static int initialDeviceAPIsCUDA() {
1079 if (initialDeviceAPILibrariesCUDA() == 0)
1080 return 0;
1081
1082 CuLaunchKernelFcnPtr =
1083 (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
1084
1085 CuMemAllocFcnPtr =
1086 (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
1087
1088 CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
1089 HandleCuda, "cuMemAllocManaged");
1090
1091 CuMemFreeFcnPtr =
1092 (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
1093
1094 CuMemcpyDtoHFcnPtr =
1095 (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
1096
1097 CuMemcpyHtoDFcnPtr =
1098 (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
1099
1100 CuModuleUnloadFcnPtr =
1101 (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
1102
1103 CuProfilerStopFcnPtr =
1104 (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop");
1105
1106 CuCtxDestroyFcnPtr =
1107 (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
1108
1109 CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
1110
1111 CuDeviceGetCountFcnPtr =
1112 (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
1113
1114 CuDeviceGetFcnPtr =
1115 (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
1116
1117 CuCtxCreateFcnPtr =
1118 (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
1119
1120 CuCtxGetCurrentFcnPtr =
1121 (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent");
1122
1123 CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
1124 HandleCuda, "cuModuleLoadDataEx");
1125
1126 CuModuleLoadDataFcnPtr =
1127 (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
1128
1129 CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
1130 HandleCuda, "cuModuleGetFunction");
1131
1132 CuDeviceComputeCapabilityFcnPtr =
1133 (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
1134 HandleCuda, "cuDeviceComputeCapability");
1135
1136 CuDeviceGetNameFcnPtr =
1137 (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
1138
1139 CuLinkAddDataFcnPtr =
1140 (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
1141
1142 CuLinkCreateFcnPtr =
1143 (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
1144
1145 CuLinkCompleteFcnPtr =
1146 (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
1147
1148 CuLinkDestroyFcnPtr =
1149 (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
1150
1151 CuCtxSynchronizeFcnPtr =
1152 (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
1153
1154 /* Get function pointer to CUDA Runtime APIs. */
1155 CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
1156 HandleCudaRT, "cudaThreadSynchronize");
1157
1158 return 1;
1159 }
1160 #pragma GCC diagnostic pop
1161
initContextCUDA()1162 static PollyGPUContext *initContextCUDA() {
1163 dump_function();
1164 PollyGPUContext *Context;
1165 CUdevice Device;
1166
1167 int Major = 0, Minor = 0, DeviceID = 0;
1168 char DeviceName[256];
1169 int DeviceCount = 0;
1170
1171 static __thread PollyGPUContext *CurrentContext = NULL;
1172
1173 if (CurrentContext)
1174 return CurrentContext;
1175
1176 /* Get API handles. */
1177 if (initialDeviceAPIsCUDA() == 0) {
1178 fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
1179 exit(-1);
1180 }
1181
1182 if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
1183 fprintf(stderr, "Initializing the CUDA driver API failed.\n");
1184 exit(-1);
1185 }
1186
1187 /* Get number of devices that supports CUDA. */
1188 CuDeviceGetCountFcnPtr(&DeviceCount);
1189 if (DeviceCount == 0) {
1190 fprintf(stderr, "There is no device supporting CUDA.\n");
1191 exit(-1);
1192 }
1193
1194 CuDeviceGetFcnPtr(&Device, 0);
1195
1196 /* Get compute capabilities and the device name. */
1197 CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
1198 CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
1199 debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
1200
1201 /* Create context on the device. */
1202 Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
1203 if (Context == 0) {
1204 fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
1205 exit(-1);
1206 }
1207 Context->Context = malloc(sizeof(CUDAContext));
1208 if (Context->Context == 0) {
1209 fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
1210 exit(-1);
1211 }
1212
1213 // In cases where managed memory is used, it is quite likely that
1214 // `cudaMallocManaged` / `polly_mallocManaged` was called before
1215 // `polly_initContext` was called.
1216 //
1217 // If `polly_initContext` calls `CuCtxCreate` when there already was a
1218 // pre-existing context created by the runtime API, this causes code running
1219 // on P100 to hang. So, we query for a pre-existing context to try and use.
1220 // If there is no pre-existing context, we create a new context
1221
1222 // The possible pre-existing context from previous runtime API calls.
1223 CUcontext MaybeRuntimeAPIContext;
1224 if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
1225 fprintf(stderr, "cuCtxGetCurrent failed.\n");
1226 exit(-1);
1227 }
1228
1229 // There was no previous context, initialise it.
1230 if (MaybeRuntimeAPIContext == NULL) {
1231 if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
1232 Device) != CUDA_SUCCESS) {
1233 fprintf(stderr, "cuCtxCreateFcnPtr failed.\n");
1234 exit(-1);
1235 }
1236 } else {
1237 ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
1238 }
1239
1240 if (CacheMode)
1241 CurrentContext = Context;
1242
1243 return Context;
1244 }
1245
freeKernelCUDA(PollyGPUFunction * Kernel)1246 static void freeKernelCUDA(PollyGPUFunction *Kernel) {
1247 dump_function();
1248
1249 if (CacheMode)
1250 return;
1251
1252 if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
1253 CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
1254
1255 if (Kernel->Kernel)
1256 free((CUDAKernel *)Kernel->Kernel);
1257
1258 if (Kernel)
1259 free(Kernel);
1260 }
1261
getKernelCUDA(const char * BinaryBuffer,const char * KernelName)1262 static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
1263 const char *KernelName) {
1264 dump_function();
1265
1266 static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
1267 static __thread int NextCacheItem = 0;
1268
1269 for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
1270 // We exploit here the property that all Polly-ACC kernels are allocated
1271 // as global constants, hence a pointer comparision is sufficient to
1272 // determin equality.
1273 if (KernelCache[i] &&
1274 ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
1275 debug_print(" -> using cached kernel\n");
1276 return KernelCache[i];
1277 }
1278 }
1279
1280 PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
1281 if (Function == 0) {
1282 fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
1283 exit(-1);
1284 }
1285 Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
1286 if (Function->Kernel == 0) {
1287 fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
1288 exit(-1);
1289 }
1290
1291 CUresult Res;
1292 CUlinkState LState;
1293 CUjit_option Options[6];
1294 void *OptionVals[6];
1295 float Walltime = 0;
1296 unsigned long LogSize = 8192;
1297 char ErrorLog[8192], InfoLog[8192];
1298 void *CuOut;
1299 size_t OutSize;
1300
1301 // Setup linker options
1302 // Return walltime from JIT compilation
1303 Options[0] = CU_JIT_WALL_TIME;
1304 OptionVals[0] = (void *)&Walltime;
1305 // Pass a buffer for info messages
1306 Options[1] = CU_JIT_INFO_LOG_BUFFER;
1307 OptionVals[1] = (void *)InfoLog;
1308 // Pass the size of the info buffer
1309 Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
1310 OptionVals[2] = (void *)LogSize;
1311 // Pass a buffer for error message
1312 Options[3] = CU_JIT_ERROR_LOG_BUFFER;
1313 OptionVals[3] = (void *)ErrorLog;
1314 // Pass the size of the error buffer
1315 Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
1316 OptionVals[4] = (void *)LogSize;
1317 // Make the linker verbose
1318 Options[5] = CU_JIT_LOG_VERBOSE;
1319 OptionVals[5] = (void *)1;
1320
1321 memset(ErrorLog, 0, sizeof(ErrorLog));
1322
1323 CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
1324 Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
1325 strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
1326 if (Res != CUDA_SUCCESS) {
1327 fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
1328 exit(-1);
1329 }
1330
1331 Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
1332 if (Res != CUDA_SUCCESS) {
1333 fprintf(stderr, "Complete ptx linker step failed.\n");
1334 fprintf(stderr, "\n%s\n", ErrorLog);
1335 exit(-1);
1336 }
1337
1338 debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
1339 InfoLog);
1340
1341 Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
1342 CuOut);
1343 if (Res != CUDA_SUCCESS) {
1344 fprintf(stderr, "Loading ptx assembly text failed.\n");
1345 exit(-1);
1346 }
1347
1348 Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
1349 ((CUDAKernel *)Function->Kernel)->CudaModule,
1350 KernelName);
1351 if (Res != CUDA_SUCCESS) {
1352 fprintf(stderr, "Loading kernel function failed.\n");
1353 exit(-1);
1354 }
1355
1356 CuLinkDestroyFcnPtr(LState);
1357
1358 ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
1359
1360 if (CacheMode) {
1361 if (KernelCache[NextCacheItem])
1362 freeKernelCUDA(KernelCache[NextCacheItem]);
1363
1364 KernelCache[NextCacheItem] = Function;
1365
1366 NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
1367 }
1368
1369 return Function;
1370 }
1371
synchronizeDeviceCUDA()1372 static void synchronizeDeviceCUDA() {
1373 dump_function();
1374 if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
1375 fprintf(stderr, "Synchronizing device and host memory failed.\n");
1376 exit(-1);
1377 }
1378 }
1379
copyFromHostToDeviceCUDA(void * HostData,PollyGPUDevicePtr * DevData,long MemSize)1380 static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
1381 long MemSize) {
1382 dump_function();
1383
1384 CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
1385 CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
1386 }
1387
copyFromDeviceToHostCUDA(PollyGPUDevicePtr * DevData,void * HostData,long MemSize)1388 static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
1389 long MemSize) {
1390 dump_function();
1391
1392 if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
1393 MemSize) != CUDA_SUCCESS) {
1394 fprintf(stderr, "Copying results from device to host memory failed.\n");
1395 exit(-1);
1396 }
1397 }
1398
launchKernelCUDA(PollyGPUFunction * Kernel,unsigned int GridDimX,unsigned int GridDimY,unsigned int BlockDimX,unsigned int BlockDimY,unsigned int BlockDimZ,void ** Parameters)1399 static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
1400 unsigned int GridDimY, unsigned int BlockDimX,
1401 unsigned int BlockDimY, unsigned int BlockDimZ,
1402 void **Parameters) {
1403 dump_function();
1404
1405 unsigned GridDimZ = 1;
1406 unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
1407 CUstream Stream = 0;
1408 void **Extra = 0;
1409
1410 CUresult Res;
1411 Res =
1412 CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
1413 GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
1414 SharedMemBytes, Stream, Parameters, Extra);
1415 if (Res != CUDA_SUCCESS) {
1416 fprintf(stderr, "Launching CUDA kernel failed.\n");
1417 exit(-1);
1418 }
1419 }
1420
1421 // Maximum number of managed memory pointers.
1422 #define DEFAULT_MAX_POINTERS 4000
1423 // For the rationale behing a list of free pointers, see `polly_freeManaged`.
1424 void **g_managedptrs;
1425 unsigned long long g_nmanagedptrs = 0;
1426 unsigned long long g_maxmanagedptrs = 0;
1427
initManagedPtrsBuffer()1428 __attribute__((constructor)) static void initManagedPtrsBuffer() {
1429 g_maxmanagedptrs = DEFAULT_MAX_POINTERS;
1430 const char *maxManagedPointersString = getenv("POLLY_MAX_MANAGED_POINTERS");
1431 if (maxManagedPointersString)
1432 g_maxmanagedptrs = atoll(maxManagedPointersString);
1433
1434 g_managedptrs = (void **)malloc(sizeof(void *) * g_maxmanagedptrs);
1435 }
1436
1437 // Add a pointer as being allocated by cuMallocManaged
addManagedPtr(void * mem)1438 void addManagedPtr(void *mem) {
1439 assert(g_maxmanagedptrs > 0 && "g_maxmanagedptrs was set to 0!");
1440 assert(g_nmanagedptrs < g_maxmanagedptrs &&
1441 "We have hit the maximum number of "
1442 "managed pointers allowed. Set the "
1443 "POLLY_MAX_MANAGED_POINTERS environment variable. ");
1444 g_managedptrs[g_nmanagedptrs++] = mem;
1445 }
1446
isManagedPtr(void * mem)1447 int isManagedPtr(void *mem) {
1448 for (unsigned long long i = 0; i < g_nmanagedptrs; i++) {
1449 if (g_managedptrs[i] == mem)
1450 return 1;
1451 }
1452 return 0;
1453 }
1454
freeManagedCUDA(void * mem)1455 void freeManagedCUDA(void *mem) {
1456 dump_function();
1457
1458 // In a real-world program this was used (COSMO), there were more `free`
1459 // calls in the original source than `malloc` calls. Hence, replacing all
1460 // `free`s with `cudaFree` does not work, since we would try to free
1461 // 'illegal' memory.
1462 // As a quick fix, we keep a free list and check if `mem` is a managed memory
1463 // pointer. If it is, we call `cudaFree`.
1464 // If not, we pass it along to the underlying allocator.
1465 // This is a hack, and can be removed if the underlying issue is fixed.
1466 if (isManagedPtr(mem)) {
1467 if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
1468 fprintf(stderr, "cudaFree failed.\n");
1469 exit(-1);
1470 }
1471 return;
1472 } else {
1473 free(mem);
1474 }
1475 }
1476
mallocManagedCUDA(size_t size)1477 void *mallocManagedCUDA(size_t size) {
1478 // Note: [Size 0 allocations]
1479 // Sometimes, some runtime computation of size could create a size of 0
1480 // for an allocation. In these cases, we do not wish to fail.
1481 // The CUDA API fails on size 0 allocations.
1482 // So, we allocate size a minimum of size 1.
1483 if (!size && DebugMode)
1484 fprintf(stderr, "cudaMallocManaged called with size 0. "
1485 "Promoting to size 1");
1486 size = max(size, 1);
1487 PollyGPUContext *_ = polly_initContextCUDA();
1488 assert(_ && "polly_initContextCUDA failed");
1489
1490 void *newMemPtr;
1491 const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size,
1492 CU_MEM_ATTACH_GLOBAL);
1493 if (Res != CUDA_SUCCESS) {
1494 fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
1495 exit(-1);
1496 }
1497 addManagedPtr(newMemPtr);
1498 return newMemPtr;
1499 }
1500
freeDeviceMemoryCUDA(PollyGPUDevicePtr * Allocation)1501 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
1502 dump_function();
1503 CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1504 CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
1505 free(DevPtr);
1506 free(Allocation);
1507 }
1508
allocateMemoryForDeviceCUDA(long MemSize)1509 static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
1510 if (!MemSize && DebugMode)
1511 fprintf(stderr, "allocateMemoryForDeviceCUDA called with size 0. "
1512 "Promoting to size 1");
1513 // see: [Size 0 allocations]
1514 MemSize = max(MemSize, 1);
1515 dump_function();
1516
1517 PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
1518 if (DevData == 0) {
1519 fprintf(stderr,
1520 "Allocate memory for GPU device memory pointer failed."
1521 " Line: %d | Size: %ld\n",
1522 __LINE__, MemSize);
1523 exit(-1);
1524 }
1525 DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
1526 if (DevData->DevicePtr == 0) {
1527 fprintf(stderr,
1528 "Allocate memory for GPU device memory pointer failed."
1529 " Line: %d | Size: %ld\n",
1530 __LINE__, MemSize);
1531 exit(-1);
1532 }
1533
1534 CUresult Res =
1535 CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
1536
1537 if (Res != CUDA_SUCCESS) {
1538 fprintf(stderr,
1539 "Allocate memory for GPU device memory pointer failed."
1540 " Line: %d | Size: %ld\n",
1541 __LINE__, MemSize);
1542 exit(-1);
1543 }
1544
1545 return DevData;
1546 }
1547
getDevicePtrCUDA(PollyGPUDevicePtr * Allocation)1548 static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
1549 dump_function();
1550
1551 CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1552 return (void *)DevPtr->Cuda;
1553 }
1554
freeContextCUDA(PollyGPUContext * Context)1555 static void freeContextCUDA(PollyGPUContext *Context) {
1556 dump_function();
1557
1558 CUDAContext *Ctx = (CUDAContext *)Context->Context;
1559 if (Ctx->Cuda) {
1560 CuProfilerStopFcnPtr();
1561 CuCtxDestroyFcnPtr(Ctx->Cuda);
1562 free(Ctx);
1563 free(Context);
1564 }
1565
1566 dlclose(HandleCuda);
1567 dlclose(HandleCudaRT);
1568 }
1569
1570 #endif /* HAS_LIBCUDART */
1571 /******************************************************************************/
1572 /* API */
1573 /******************************************************************************/
1574
polly_initContext()1575 PollyGPUContext *polly_initContext() {
1576 DebugMode = getenv("POLLY_DEBUG") != 0;
1577 CacheMode = getenv("POLLY_NOCACHE") == 0;
1578
1579 dump_function();
1580
1581 PollyGPUContext *Context;
1582
1583 switch (Runtime) {
1584 #ifdef HAS_LIBCUDART
1585 case RUNTIME_CUDA:
1586 Context = initContextCUDA();
1587 break;
1588 #endif /* HAS_LIBCUDART */
1589 #ifdef HAS_LIBOPENCL
1590 case RUNTIME_CL:
1591 Context = initContextCL();
1592 break;
1593 #endif /* HAS_LIBOPENCL */
1594 default:
1595 err_runtime();
1596 }
1597
1598 return Context;
1599 }
1600
polly_freeKernel(PollyGPUFunction * Kernel)1601 void polly_freeKernel(PollyGPUFunction *Kernel) {
1602 dump_function();
1603
1604 switch (Runtime) {
1605 #ifdef HAS_LIBCUDART
1606 case RUNTIME_CUDA:
1607 freeKernelCUDA(Kernel);
1608 break;
1609 #endif /* HAS_LIBCUDART */
1610 #ifdef HAS_LIBOPENCL
1611 case RUNTIME_CL:
1612 freeKernelCL(Kernel);
1613 break;
1614 #endif /* HAS_LIBOPENCL */
1615 default:
1616 err_runtime();
1617 }
1618 }
1619
polly_getKernel(const char * BinaryBuffer,const char * KernelName)1620 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
1621 const char *KernelName) {
1622 dump_function();
1623
1624 PollyGPUFunction *Function;
1625
1626 switch (Runtime) {
1627 #ifdef HAS_LIBCUDART
1628 case RUNTIME_CUDA:
1629 Function = getKernelCUDA(BinaryBuffer, KernelName);
1630 break;
1631 #endif /* HAS_LIBCUDART */
1632 #ifdef HAS_LIBOPENCL
1633 case RUNTIME_CL:
1634 Function = getKernelCL(BinaryBuffer, KernelName);
1635 break;
1636 #endif /* HAS_LIBOPENCL */
1637 default:
1638 err_runtime();
1639 }
1640
1641 return Function;
1642 }
1643
polly_copyFromHostToDevice(void * HostData,PollyGPUDevicePtr * DevData,long MemSize)1644 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
1645 long MemSize) {
1646 dump_function();
1647
1648 switch (Runtime) {
1649 #ifdef HAS_LIBCUDART
1650 case RUNTIME_CUDA:
1651 copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
1652 break;
1653 #endif /* HAS_LIBCUDART */
1654 #ifdef HAS_LIBOPENCL
1655 case RUNTIME_CL:
1656 copyFromHostToDeviceCL(HostData, DevData, MemSize);
1657 break;
1658 #endif /* HAS_LIBOPENCL */
1659 default:
1660 err_runtime();
1661 }
1662 }
1663
polly_copyFromDeviceToHost(PollyGPUDevicePtr * DevData,void * HostData,long MemSize)1664 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
1665 long MemSize) {
1666 dump_function();
1667
1668 switch (Runtime) {
1669 #ifdef HAS_LIBCUDART
1670 case RUNTIME_CUDA:
1671 copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
1672 break;
1673 #endif /* HAS_LIBCUDART */
1674 #ifdef HAS_LIBOPENCL
1675 case RUNTIME_CL:
1676 copyFromDeviceToHostCL(DevData, HostData, MemSize);
1677 break;
1678 #endif /* HAS_LIBOPENCL */
1679 default:
1680 err_runtime();
1681 }
1682 }
1683
polly_launchKernel(PollyGPUFunction * Kernel,unsigned int GridDimX,unsigned int GridDimY,unsigned int BlockDimX,unsigned int BlockDimY,unsigned int BlockDimZ,void ** Parameters)1684 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
1685 unsigned int GridDimY, unsigned int BlockDimX,
1686 unsigned int BlockDimY, unsigned int BlockDimZ,
1687 void **Parameters) {
1688 dump_function();
1689
1690 switch (Runtime) {
1691 #ifdef HAS_LIBCUDART
1692 case RUNTIME_CUDA:
1693 launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
1694 BlockDimZ, Parameters);
1695 break;
1696 #endif /* HAS_LIBCUDART */
1697 #ifdef HAS_LIBOPENCL
1698 case RUNTIME_CL:
1699 launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
1700 Parameters);
1701 break;
1702 #endif /* HAS_LIBOPENCL */
1703 default:
1704 err_runtime();
1705 }
1706 }
1707
polly_freeDeviceMemory(PollyGPUDevicePtr * Allocation)1708 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
1709 dump_function();
1710
1711 switch (Runtime) {
1712 #ifdef HAS_LIBCUDART
1713 case RUNTIME_CUDA:
1714 freeDeviceMemoryCUDA(Allocation);
1715 break;
1716 #endif /* HAS_LIBCUDART */
1717 #ifdef HAS_LIBOPENCL
1718 case RUNTIME_CL:
1719 freeDeviceMemoryCL(Allocation);
1720 break;
1721 #endif /* HAS_LIBOPENCL */
1722 default:
1723 err_runtime();
1724 }
1725 }
1726
polly_allocateMemoryForDevice(long MemSize)1727 PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
1728 dump_function();
1729
1730 PollyGPUDevicePtr *DevData;
1731
1732 switch (Runtime) {
1733 #ifdef HAS_LIBCUDART
1734 case RUNTIME_CUDA:
1735 DevData = allocateMemoryForDeviceCUDA(MemSize);
1736 break;
1737 #endif /* HAS_LIBCUDART */
1738 #ifdef HAS_LIBOPENCL
1739 case RUNTIME_CL:
1740 DevData = allocateMemoryForDeviceCL(MemSize);
1741 break;
1742 #endif /* HAS_LIBOPENCL */
1743 default:
1744 err_runtime();
1745 }
1746
1747 return DevData;
1748 }
1749
polly_getDevicePtr(PollyGPUDevicePtr * Allocation)1750 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
1751 dump_function();
1752
1753 void *DevPtr;
1754
1755 switch (Runtime) {
1756 #ifdef HAS_LIBCUDART
1757 case RUNTIME_CUDA:
1758 DevPtr = getDevicePtrCUDA(Allocation);
1759 break;
1760 #endif /* HAS_LIBCUDART */
1761 #ifdef HAS_LIBOPENCL
1762 case RUNTIME_CL:
1763 DevPtr = getDevicePtrCL(Allocation);
1764 break;
1765 #endif /* HAS_LIBOPENCL */
1766 default:
1767 err_runtime();
1768 }
1769
1770 return DevPtr;
1771 }
1772
polly_synchronizeDevice()1773 void polly_synchronizeDevice() {
1774 dump_function();
1775
1776 switch (Runtime) {
1777 #ifdef HAS_LIBCUDART
1778 case RUNTIME_CUDA:
1779 synchronizeDeviceCUDA();
1780 break;
1781 #endif /* HAS_LIBCUDART */
1782 #ifdef HAS_LIBOPENCL
1783 case RUNTIME_CL:
1784 synchronizeDeviceCL();
1785 break;
1786 #endif /* HAS_LIBOPENCL */
1787 default:
1788 err_runtime();
1789 }
1790 }
1791
polly_freeContext(PollyGPUContext * Context)1792 void polly_freeContext(PollyGPUContext *Context) {
1793 dump_function();
1794
1795 if (CacheMode)
1796 return;
1797
1798 switch (Runtime) {
1799 #ifdef HAS_LIBCUDART
1800 case RUNTIME_CUDA:
1801 freeContextCUDA(Context);
1802 break;
1803 #endif /* HAS_LIBCUDART */
1804 #ifdef HAS_LIBOPENCL
1805 case RUNTIME_CL:
1806 freeContextCL(Context);
1807 break;
1808 #endif /* HAS_LIBOPENCL */
1809 default:
1810 err_runtime();
1811 }
1812 }
1813
polly_freeManaged(void * mem)1814 void polly_freeManaged(void *mem) {
1815 dump_function();
1816
1817 #ifdef HAS_LIBCUDART
1818 freeManagedCUDA(mem);
1819 #else
1820 fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1821 exit(-1);
1822 #endif
1823 }
1824
polly_mallocManaged(size_t size)1825 void *polly_mallocManaged(size_t size) {
1826 dump_function();
1827
1828 #ifdef HAS_LIBCUDART
1829 return mallocManagedCUDA(size);
1830 #else
1831 fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1832 exit(-1);
1833 #endif
1834 }
1835
1836 /* Initialize GPUJIT with CUDA as runtime library. */
polly_initContextCUDA()1837 PollyGPUContext *polly_initContextCUDA() {
1838 #ifdef HAS_LIBCUDART
1839 Runtime = RUNTIME_CUDA;
1840 return polly_initContext();
1841 #else
1842 fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
1843 exit(-1);
1844 #endif /* HAS_LIBCUDART */
1845 }
1846
1847 /* Initialize GPUJIT with OpenCL as runtime library. */
polly_initContextCL()1848 PollyGPUContext *polly_initContextCL() {
1849 #ifdef HAS_LIBOPENCL
1850 Runtime = RUNTIME_CL;
1851 return polly_initContext();
1852 #else
1853 fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
1854 exit(-1);
1855 #endif /* HAS_LIBOPENCL */
1856 }
1857