1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "../../test_common/harness/compat.h"
17
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <sys/stat.h>
22 #include <sys/types.h>
23
24 #include "../../test_common/harness/conversions.h"
25 #include "procs.h"
26
27 static const char *import_after_export_aliased_local_kernel =
28 "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
29 "%s\n" // optional pragma string
30 "__kernel void test_fn( const __global %s *exportSrc, __global %s "
31 "*exportDst,\n"
32 " const __global %s *importSrc, __global %s "
33 "*importDst,\n"
34 " __local %s *localBuffer, /* there isn't another "
35 "__local %s local buffer since export src and import dst are aliased*/\n"
36 " int exportSrcLocalSize, int "
37 "exportCopiesPerWorkItem,\n"
38 " int importSrcLocalSize, int "
39 "importCopiesPerWorkItem )\n"
40 "{\n"
41 " int i;\n"
42 " int localImportOffset = exportSrcLocalSize - importSrcLocalSize;\n"
43 // Zero the local storage first
44 " for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
45 " localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
46 "(%s)(%s)0;\n"
47 " }\n"
48 " // no need to set another local buffer values to (%s)(%s)0 since "
49 "export src and import dst are aliased (use the same buffer)\n"
50 // Do this to verify all kernels are done zeroing the local buffer before we
51 // try the export and import
52 " barrier( CLK_LOCAL_MEM_FENCE );\n"
53 " for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
54 " localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
55 "exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
56 " }\n"
57 // Do this to verify all kernels are done copying to the local buffer before
58 // we try the export and import
59 " barrier( CLK_LOCAL_MEM_FENCE );\n"
60 " event_t events;\n"
61 " events = async_work_group_copy((__global "
62 "%s*)(exportDst+exportSrcLocalSize*get_group_id(0)), (__local const "
63 "%s*)localBuffer, (size_t)exportSrcLocalSize, 0 );\n"
64 " async_work_group_copy_fence( CLK_LOCAL_MEM_FENCE );\n"
65 " events = async_work_group_copy( (__local "
66 "%s*)(localBuffer+localImportOffset), (__global const "
67 "%s*)(importSrc+importSrcLocalSize*get_group_id(0)), "
68 "(size_t)importSrcLocalSize, events );\n"
69 // Wait for the export and import to complete, then verify by manually
70 // copying to the dest
71 " wait_group_events( 2, &events );\n"
72 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
73 " importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
74 "(localBuffer+localImportOffset)[ get_local_id( 0 "
75 ")*importCopiesPerWorkItem+i ];\n"
76 " }\n"
77 "}\n";
78
79 static const char *import_after_export_aliased_global_kernel =
80 "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
81 "%s\n" // optional pragma string
82 "__kernel void test_fn( const __global %s *exportSrc, __global %s "
83 "*exportDstImportSrc,\n"
84 " __global %s *importDst, /* there isn't a dedicated "
85 "__global %s buffer for import src since export dst and import src are "
86 "aliased*/\n"
87 " __local %s *exportLocalBuffer, __local %s "
88 "*importLocalBuffer,\n"
89 " int exportSrcLocalSize, int "
90 "exportCopiesPerWorkItem,\n"
91 " int importSrcLocalSize, int "
92 "importCopiesPerWorkItem )\n"
93 "{\n"
94 " int i;\n"
95 // Zero the local storage first
96 " for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
97 " exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
98 "= (%s)(%s)0;\n"
99 " }\n"
100 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
101 " importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] "
102 "= (%s)(%s)0;\n"
103 " }\n"
104 // Do this to verify all kernels are done zeroing the local buffer before we
105 // try the export and import
106 " barrier( CLK_LOCAL_MEM_FENCE );\n"
107 " for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
108 " exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
109 "= exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
110 " }\n"
111 // Do this to verify all kernels are done copying to the local buffer before
112 // we try the export and import
113 " barrier( CLK_LOCAL_MEM_FENCE );\n"
114 " event_t events;\n"
115 " events = async_work_group_copy((__global "
116 "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0)), (__local "
117 "const %s*)exportLocalBuffer, (size_t)exportSrcLocalSize, 0 );\n"
118 " async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE );\n"
119 " events = async_work_group_copy( (__local %s*)importLocalBuffer, "
120 "(__global const "
121 "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0) + "
122 "(exportSrcLocalSize - importSrcLocalSize)), (size_t)importSrcLocalSize, "
123 "events );\n"
124 // Wait for the export and import to complete, then verify by manually
125 // copying to the dest
126 " wait_group_events( 2, &events );\n"
127 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
128 " importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
129 "importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
130 " }\n"
131 "}\n";
132
133 static const char *import_after_export_aliased_global_and_local_kernel =
134 "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
135 "%s\n" // optional pragma string
136 "__kernel void test_fn( const __global %s *exportSrc, __global %s "
137 "*exportDstImportSrc,\n"
138 " __global %s *importDst, /* there isn't a dedicated "
139 "__global %s buffer for import src since export dst and import src are "
140 "aliased*/\n"
141 " __local %s *localBuffer, /* there isn't another "
142 "__local %s local buffer since export src and import dst are aliased*/\n"
143 " int exportSrcLocalSize, int "
144 "exportCopiesPerWorkItem,\n"
145 " int importSrcLocalSize, int "
146 "importCopiesPerWorkItem )\n"
147 "{\n"
148 " int i;\n"
149 " int localImportOffset = exportSrcLocalSize - importSrcLocalSize;\n"
150 // Zero the local storage first
151 " for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
152 " localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
153 "(%s)(%s)0;\n"
154 " }\n"
155 " // no need to set another local buffer values to (%s)(%s)0 since "
156 "export src and import dst are aliased (use the same buffer)\n"
157 // Do this to verify all kernels are done zeroing the local buffer before we
158 // try the export and import
159 " barrier( CLK_LOCAL_MEM_FENCE );\n"
160 " for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
161 " localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
162 "exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
163 " }\n"
164 // Do this to verify all kernels are done copying to the local buffer before
165 // we try the export and import
166 " barrier( CLK_LOCAL_MEM_FENCE );\n"
167 " event_t events;\n"
168 " events = async_work_group_copy((__global "
169 "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0)), (__local "
170 "const %s*)localBuffer, (size_t)exportSrcLocalSize, 0 );\n"
171 " async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE | "
172 "CLK_LOCAL_MEM_FENCE );\n"
173 " events = async_work_group_copy( (__local "
174 "%s*)(localBuffer+localImportOffset), (__global const "
175 "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0) + "
176 "(exportSrcLocalSize - importSrcLocalSize)), (size_t)importSrcLocalSize, "
177 "events );\n"
178 // Wait for the export and import to complete, then verify by manually
179 // copying to the dest
180 " wait_group_events( 2, &events );\n"
181 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
182 " importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
183 "(localBuffer+localImportOffset)[ get_local_id( 0 "
184 ")*importCopiesPerWorkItem+i ];\n"
185 " }\n"
186 "}\n";
187
188 static const char *export_after_import_aliased_local_kernel =
189 "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
190 "%s\n" // optional pragma string
191 "__kernel void test_fn( const __global %s *importSrc, __global %s "
192 "*importDst,\n"
193 " const __global %s *exportDst, /* there isn't a "
194 "dedicated __global %s buffer for export src since the local memory is "
195 "aliased, so the export src is taken from it */\n"
196 " __local %s *localBuffer, /* there isn't another "
197 "__local %s local buffer since import dst and export src are aliased*/\n"
198 " int importSrcLocalSize, int "
199 "importCopiesPerWorkItem,\n"
200 " int exportSrcLocalSize, int "
201 "exportCopiesPerWorkItem )\n"
202 "{\n"
203 " int i;\n"
204 // Zero the local storage first
205 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
206 " localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] = "
207 "(%s)(%s)0;\n"
208 " }\n"
209 " // no need to set another local buffer values to (%s)(%s)0 since "
210 "import dst and export src are aliased (use the same buffer)\n"
211 // Do this to verify all kernels are done zeroing the local buffer before we
212 // try the import and export
213 " barrier( CLK_LOCAL_MEM_FENCE );\n"
214 " event_t events;\n"
215 " events = async_work_group_copy( (__local %s*)localBuffer, (__global "
216 "const %s*)(importSrc+importSrcLocalSize*get_group_id(0)), "
217 "(size_t)importSrcLocalSize, events );\n"
218 " async_work_group_copy_fence( CLK_LOCAL_MEM_FENCE );\n"
219 " events = async_work_group_copy((__global "
220 "%s*)(exportDst+exportSrcLocalSize*get_group_id(0)), (__local const "
221 "%s*)(localBuffer + (importSrcLocalSize - exportSrcLocalSize)), "
222 "(size_t)exportSrcLocalSize, 0 );\n"
223 // Wait for the import and export to complete, then verify by manually
224 // copying to the dest
225 " wait_group_events( 2, &events );\n"
226 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
227 " importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
228 "localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
229 " }\n"
230 "}\n";
231
232 static const char *export_after_import_aliased_global_kernel =
233 "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
234 "%s\n" // optional pragma string
235 "__kernel void test_fn( const __global %s *importSrcExportDst, __global %s "
236 "*importDst,\n"
237 " const __global %s *exportSrc,\n"
238 " /* there isn't a dedicated __global %s buffer for "
239 "export dst since import src and export dst are aliased */\n"
240 " __local %s *importLocalBuffer, __local %s "
241 "*exportLocalBuffer,\n"
242 " int importSrcLocalSize, int "
243 "importCopiesPerWorkItem,\n"
244 " int exportSrcLocalSize, int "
245 "exportCopiesPerWorkItem )\n"
246 "{\n"
247 " int i;\n"
248 // Zero the local storage first
249 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
250 " importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] "
251 "= (%s)(%s)0;\n"
252 " }\n"
253 " for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
254 " exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
255 "= (%s)(%s)0;\n"
256 " }\n"
257 // Do this to verify all kernels are done zeroing the local buffer before we
258 // try the import and export
259 " barrier( CLK_LOCAL_MEM_FENCE );\n"
260 " for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
261 " exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
262 "= exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
263 " }\n"
264 // Do this to verify all kernels are done copying to the local buffer before
265 // we try the import and export
266 " barrier( CLK_LOCAL_MEM_FENCE );\n"
267 " event_t events;\n"
268 " events = async_work_group_copy( (__local %s*)importLocalBuffer, "
269 "(__global const "
270 "%s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0)), "
271 "(size_t)importSrcLocalSize, 0 );\n"
272 " async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE );\n"
273 " events = async_work_group_copy((__global "
274 "%s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0) + "
275 "(importSrcLocalSize - exportSrcLocalSize)), (__local const "
276 "%s*)exportLocalBuffer, (size_t)exportSrcLocalSize, events );\n"
277 // Wait for the import and export to complete, then verify by manually
278 // copying to the dest
279 " wait_group_events( 2, &events );\n"
280 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
281 " importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
282 "importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
283 " }\n"
284 "}\n";
285
286 static const char *export_after_import_aliased_global_and_local_kernel =
287 "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
288 "%s\n" // optional pragma string
289 "__kernel void test_fn( const __global %s *importSrcExportDst, __global %s "
290 "*importDst,\n"
291 " /* there isn't a dedicated __global %s buffer for "
292 "export src since the local memory is aliased, so the export src is taken "
293 "from it */\n"
294 " /* there isn't a dedicated __global %s buffer for "
295 "export dst since import src and export dst are aliased */\n"
296 " __local %s *localBuffer, /* there isn't another "
297 "__local %s local buffer since import dst and export src are aliased*/\n"
298 " int importSrcLocalSize, int "
299 "importCopiesPerWorkItem,\n"
300 " int exportSrcLocalSize, int "
301 "exportCopiesPerWorkItem )\n"
302 "{\n"
303 " int i;\n"
304 // Zero the local storage first
305 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
306 " localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] = "
307 "(%s)(%s)0;\n"
308 " }\n"
309 " // no need to set another local buffer values to (%s)(%s)0 since "
310 "import dst and export src are aliased (use the same buffer)\n"
311 // Do this to verify all kernels are done zeroing the local buffer before we
312 // try the import and export
313 " barrier( CLK_LOCAL_MEM_FENCE );\n"
314 " event_t events;\n"
315 " events = async_work_group_copy( (__local %s*)localBuffer, (__global "
316 "const %s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0)), "
317 "(size_t)importSrcLocalSize, 0 );\n"
318 " async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE | "
319 "CLK_LOCAL_MEM_FENCE );\n"
320 " events = async_work_group_copy((__global "
321 "%s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0) + "
322 "(importSrcLocalSize - exportSrcLocalSize)), (__local const "
323 "%s*)(localBuffer + (importSrcLocalSize - exportSrcLocalSize)), "
324 "(size_t)exportSrcLocalSize, events );\n"
325 // Wait for the import and export to complete, then verify by manually
326 // copying to the dest
327 " wait_group_events( 2, &events );\n"
328 " for(i=0; i<importCopiesPerWorkItem; i++) {\n"
329 " importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
330 "localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
331 " }\n"
332 "}\n";
333
test_copy_fence(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,ExplicitType vecType,int vecSize,bool export_after_import,bool aliased_local_mem,bool aliased_global_mem)334 int test_copy_fence(cl_device_id deviceID, cl_context context,
335 cl_command_queue queue, const char *kernelCode,
336 ExplicitType vecType, int vecSize, bool export_after_import,
337 bool aliased_local_mem, bool aliased_global_mem)
338 {
339 int error;
340 clProgramWrapper program;
341 clKernelWrapper kernel;
342 clMemWrapper streams[4];
343 size_t threads[1], localThreads[1];
344 void *transaction1InBuffer, *transaction1OutBuffer, *transaction2InBuffer,
345 *transaction2OutBuffer;
346 MTdata d;
347 bool transaction1DstIsTransaction2Src =
348 (aliased_global_mem && !export_after_import)
349 || (aliased_local_mem && export_after_import);
350 bool transaction1SrcIsTransaction2Dst =
351 aliased_global_mem && export_after_import;
352 char vecNameString[64];
353 vecNameString[0] = 0;
354 if (vecSize == 1)
355 sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
356 else
357 sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
358 vecSize);
359
360 size_t elementSize = get_explicit_type_size(vecType) * vecSize;
361 log_info("Testing %s\n", vecNameString);
362
363 cl_long max_local_mem_size;
364 error =
365 clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
366 sizeof(max_local_mem_size), &max_local_mem_size, NULL);
367 test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
368
369 unsigned int num_of_compute_devices;
370 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
371 sizeof(num_of_compute_devices),
372 &num_of_compute_devices, NULL);
373 test_error(error,
374 "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
375
376 char programSource[4096];
377 programSource[0] = 0;
378 char *programPtr;
379
380 sprintf(programSource, kernelCode,
381 vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
382 : "",
383 vecNameString, vecNameString, vecNameString, vecNameString,
384 vecNameString, vecNameString, vecNameString,
385 get_explicit_type_name(vecType), vecNameString,
386 get_explicit_type_name(vecType), vecNameString, vecNameString,
387 vecNameString, vecNameString);
388 // log_info("program: %s\n", programSource);
389 programPtr = programSource;
390
391 error = create_single_kernel_helper(context, &program, &kernel, 1,
392 (const char **)&programPtr, "test_fn");
393 test_error(error, "Unable to create testing kernel");
394
395 size_t max_workgroup_size;
396 error = clGetKernelWorkGroupInfo(
397 kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
398 &max_workgroup_size, NULL);
399 test_error(
400 error,
401 "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
402
403 size_t max_local_workgroup_size[3];
404 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
405 sizeof(max_local_workgroup_size),
406 max_local_workgroup_size, NULL);
407 test_error(error,
408 "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
409
410 // Pick the minimum of the device and the kernel
411 if (max_workgroup_size > max_local_workgroup_size[0])
412 max_workgroup_size = max_local_workgroup_size[0];
413
414 size_t transaction1NumberOfCopiesPerWorkitem = 13;
415 size_t transaction2NumberOfCopiesPerWorkitem = 2;
416 elementSize =
417 get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
418 size_t localStorageSpacePerWorkitem =
419 transaction1NumberOfCopiesPerWorkitem * elementSize
420 + (aliased_local_mem
421 ? 0
422 : transaction2NumberOfCopiesPerWorkitem * elementSize);
423 size_t maxLocalWorkgroupSize =
424 (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
425
426 // Calculation can return 0 on embedded devices due to 1KB local mem limit
427 if (maxLocalWorkgroupSize == 0)
428 {
429 maxLocalWorkgroupSize = 1;
430 }
431
432 size_t localWorkgroupSize = maxLocalWorkgroupSize;
433 if (maxLocalWorkgroupSize > max_workgroup_size)
434 localWorkgroupSize = max_workgroup_size;
435
436 size_t transaction1LocalBufferSize = localWorkgroupSize * elementSize
437 * transaction1NumberOfCopiesPerWorkitem;
438 size_t transaction2LocalBufferSize = localWorkgroupSize * elementSize
439 * transaction2NumberOfCopiesPerWorkitem; // irrelevant if
440 // aliased_local_mem
441 size_t numberOfLocalWorkgroups = 1111;
442 size_t transaction1GlobalBufferSize =
443 numberOfLocalWorkgroups * transaction1LocalBufferSize;
444 size_t transaction2GlobalBufferSize =
445 numberOfLocalWorkgroups * transaction2LocalBufferSize;
446 size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
447
448 transaction1InBuffer = (void *)malloc(transaction1GlobalBufferSize);
449 transaction1OutBuffer = (void *)malloc(transaction1GlobalBufferSize);
450 transaction2InBuffer = (void *)malloc(transaction2GlobalBufferSize);
451 transaction2OutBuffer = (void *)malloc(transaction2GlobalBufferSize);
452 memset(transaction1OutBuffer, 0, transaction1GlobalBufferSize);
453 memset(transaction2OutBuffer, 0, transaction2GlobalBufferSize);
454
455 cl_int transaction1CopiesPerWorkitemInt, transaction1CopiesPerWorkgroup,
456 transaction2CopiesPerWorkitemInt, transaction2CopiesPerWorkgroup;
457 transaction1CopiesPerWorkitemInt =
458 (int)transaction1NumberOfCopiesPerWorkitem;
459 transaction1CopiesPerWorkgroup =
460 (int)(transaction1NumberOfCopiesPerWorkitem * localWorkgroupSize);
461 transaction2CopiesPerWorkitemInt =
462 (int)transaction2NumberOfCopiesPerWorkitem;
463 transaction2CopiesPerWorkgroup =
464 (int)(transaction2NumberOfCopiesPerWorkitem * localWorkgroupSize);
465
466 log_info(
467 "Global: %d, local %d. 1st Transaction: local buffer %db, global "
468 "buffer %db, each work group will copy %d elements and each work "
469 "item item will copy %d elements. 2nd Transaction: local buffer "
470 "%db, global buffer %db, each work group will copy %d elements and "
471 "each work item will copy %d elements\n",
472 (int)globalWorkgroupSize, (int)localWorkgroupSize,
473 (int)transaction1LocalBufferSize, (int)transaction1GlobalBufferSize,
474 transaction1CopiesPerWorkgroup, transaction1CopiesPerWorkitemInt,
475 (int)transaction2LocalBufferSize, (int)transaction2GlobalBufferSize,
476 transaction2CopiesPerWorkgroup, transaction2CopiesPerWorkitemInt);
477
478 threads[0] = globalWorkgroupSize;
479 localThreads[0] = localWorkgroupSize;
480
481 d = init_genrand(gRandomSeed);
482 generate_random_data(
483 vecType, transaction1GlobalBufferSize / get_explicit_type_size(vecType),
484 d, transaction1InBuffer);
485 if (!transaction1DstIsTransaction2Src)
486 {
487 generate_random_data(vecType,
488 transaction2GlobalBufferSize
489 / get_explicit_type_size(vecType),
490 d, transaction2InBuffer);
491 }
492 free_mtdata(d);
493 d = NULL;
494
495 streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
496 transaction1GlobalBufferSize,
497 transaction1InBuffer, &error);
498 test_error(error, "Unable to create input buffer");
499 streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
500 transaction1GlobalBufferSize,
501 transaction1OutBuffer, &error);
502 test_error(error, "Unable to create output buffer");
503 if (!transaction1DstIsTransaction2Src)
504 {
505 streams[2] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
506 transaction2GlobalBufferSize,
507 transaction2InBuffer, &error);
508 test_error(error, "Unable to create input buffer");
509 }
510 if (!transaction1SrcIsTransaction2Dst)
511 {
512 streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
513 transaction2GlobalBufferSize,
514 transaction2OutBuffer, &error);
515 test_error(error, "Unable to create output buffer");
516 }
517
518 cl_uint argIndex = 0;
519 error = clSetKernelArg(kernel, argIndex, sizeof(streams[0]), &streams[0]);
520 test_error(error, "Unable to set kernel argument");
521 ++argIndex;
522 error = clSetKernelArg(kernel, argIndex, sizeof(streams[1]), &streams[1]);
523 test_error(error, "Unable to set kernel argument");
524 ++argIndex;
525 if (!transaction1DstIsTransaction2Src)
526 {
527 error =
528 clSetKernelArg(kernel, argIndex, sizeof(streams[2]), &streams[2]);
529 test_error(error, "Unable to set kernel argument");
530 ++argIndex;
531 }
532 if (!transaction1SrcIsTransaction2Dst)
533 {
534 error =
535 clSetKernelArg(kernel, argIndex, sizeof(streams[3]), &streams[3]);
536 test_error(error, "Unable to set kernel argument");
537 ++argIndex;
538 }
539 error = clSetKernelArg(kernel, argIndex, transaction1LocalBufferSize, NULL);
540 test_error(error, "Unable to set kernel argument");
541 ++argIndex;
542 if (!aliased_local_mem)
543 {
544 error =
545 clSetKernelArg(kernel, argIndex, transaction2LocalBufferSize, NULL);
546 test_error(error, "Unable to set kernel argument");
547 ++argIndex;
548 }
549 error =
550 clSetKernelArg(kernel, argIndex, sizeof(transaction1CopiesPerWorkgroup),
551 &transaction1CopiesPerWorkgroup);
552 test_error(error, "Unable to set kernel argument");
553 ++argIndex;
554 error = clSetKernelArg(kernel, argIndex,
555 sizeof(transaction1CopiesPerWorkitemInt),
556 &transaction1CopiesPerWorkitemInt);
557 test_error(error, "Unable to set kernel argument");
558 ++argIndex;
559 error =
560 clSetKernelArg(kernel, argIndex, sizeof(transaction2CopiesPerWorkgroup),
561 &transaction2CopiesPerWorkgroup);
562 test_error(error, "Unable to set kernel argument");
563 ++argIndex;
564 error = clSetKernelArg(kernel, argIndex,
565 sizeof(transaction2CopiesPerWorkitemInt),
566 &transaction2CopiesPerWorkitemInt);
567 test_error(error, "Unable to set kernel argument");
568
569 // Enqueue
570 error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
571 localThreads, 0, NULL, NULL);
572 test_error(error, "Unable to queue kernel");
573
574 // Read
575 error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
576 transaction1GlobalBufferSize,
577 transaction1OutBuffer, 0, NULL, NULL);
578 test_error(error, "Unable to read results");
579 if (transaction1DstIsTransaction2Src)
580 {
581 for (size_t idx = 0; idx < numberOfLocalWorkgroups; idx++)
582 {
583 memcpy(
584 (void *)((unsigned char *)transaction2InBuffer
585 + idx * transaction2CopiesPerWorkgroup * elementSize),
586 (const void *)((unsigned char *)transaction1OutBuffer
587 + (idx * transaction1CopiesPerWorkgroup
588 + (transaction1CopiesPerWorkgroup
589 - transaction2CopiesPerWorkgroup))
590 * elementSize),
591 (size_t)transaction2CopiesPerWorkgroup * elementSize);
592 }
593 }
594 if (transaction1SrcIsTransaction2Dst)
595 {
596 void *transaction1SrcBuffer =
597 (void *)malloc(transaction1GlobalBufferSize);
598 error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
599 transaction1GlobalBufferSize,
600 transaction1SrcBuffer, 0, NULL, NULL);
601 test_error(error, "Unable to read results");
602 for (size_t idx = 0; idx < numberOfLocalWorkgroups; idx++)
603 {
604 memcpy(
605 (void *)((unsigned char *)transaction2OutBuffer
606 + idx * transaction2CopiesPerWorkgroup * elementSize),
607 (const void *)((unsigned char *)transaction1SrcBuffer
608 + (idx * transaction1CopiesPerWorkgroup
609 + (transaction1CopiesPerWorkgroup
610 - transaction2CopiesPerWorkgroup))
611 * elementSize),
612 (size_t)transaction2CopiesPerWorkgroup * elementSize);
613 }
614 free(transaction1SrcBuffer);
615 }
616 else
617 {
618 error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0,
619 transaction2GlobalBufferSize,
620 transaction2OutBuffer, 0, NULL, NULL);
621 test_error(error, "Unable to read results");
622 }
623
624 // Verify
625 int failuresPrinted = 0;
626 if (memcmp(transaction1InBuffer, transaction1OutBuffer,
627 transaction1GlobalBufferSize)
628 != 0)
629 {
630 size_t typeSize = get_explicit_type_size(vecType) * vecSize;
631 unsigned char *inchar = (unsigned char *)transaction1InBuffer;
632 unsigned char *outchar = (unsigned char *)transaction1OutBuffer;
633 for (int i = 0; i < (int)transaction1GlobalBufferSize;
634 i += (int)elementSize)
635 {
636 if (memcmp(((char *)inchar) + i, ((char *)outchar) + i, typeSize)
637 != 0)
638 {
639 char values[4096];
640 values[0] = 0;
641 if (failuresPrinted == 0)
642 {
643 // Print first failure message
644 log_error("ERROR: Results of 1st transaction did not "
645 "validate!\n");
646 }
647 sprintf(values + strlen(values), "%d -> [", i);
648 for (int j = 0; j < (int)elementSize; j++)
649 sprintf(values + strlen(values), "%2x ", inchar[i + j]);
650 sprintf(values + strlen(values), "] != [");
651 for (int j = 0; j < (int)elementSize; j++)
652 sprintf(values + strlen(values), "%2x ", outchar[i + j]);
653 sprintf(values + strlen(values), "]");
654 log_error("%s\n", values);
655 failuresPrinted++;
656 }
657
658 if (failuresPrinted > 5)
659 {
660 log_error("Not printing further failures...\n");
661 break;
662 }
663 }
664 }
665 if (memcmp(transaction2InBuffer, transaction2OutBuffer,
666 transaction2GlobalBufferSize)
667 != 0)
668 {
669 size_t typeSize = get_explicit_type_size(vecType) * vecSize;
670 unsigned char *inchar = (unsigned char *)transaction2InBuffer;
671 unsigned char *outchar = (unsigned char *)transaction2OutBuffer;
672 for (int i = 0; i < (int)transaction2GlobalBufferSize;
673 i += (int)elementSize)
674 {
675 if (memcmp(((char *)inchar) + i, ((char *)outchar) + i, typeSize)
676 != 0)
677 {
678 char values[4096];
679 values[0] = 0;
680 if (failuresPrinted == 0)
681 {
682 // Print first failure message
683 log_error("ERROR: Results of 2nd transaction did not "
684 "validate!\n");
685 }
686 sprintf(values + strlen(values), "%d -> [", i);
687 for (int j = 0; j < (int)elementSize; j++)
688 sprintf(values + strlen(values), "%2x ", inchar[i + j]);
689 sprintf(values + strlen(values), "] != [");
690 for (int j = 0; j < (int)elementSize; j++)
691 sprintf(values + strlen(values), "%2x ", outchar[i + j]);
692 sprintf(values + strlen(values), "]");
693 log_error("%s\n", values);
694 failuresPrinted++;
695 }
696
697 if (failuresPrinted > 5)
698 {
699 log_error("Not printing further failures...\n");
700 break;
701 }
702 }
703 }
704
705 free(transaction1InBuffer);
706 free(transaction1OutBuffer);
707 free(transaction2InBuffer);
708 free(transaction2OutBuffer);
709
710 return failuresPrinted ? -1 : 0;
711 }
712
test_copy_fence_all_types(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,bool export_after_import,bool aliased_local_mem,bool aliased_global_mem)713 int test_copy_fence_all_types(cl_device_id deviceID, cl_context context,
714 cl_command_queue queue, const char *kernelCode,
715 bool export_after_import, bool aliased_local_mem,
716 bool aliased_global_mem)
717 {
718 ExplicitType vecType[] = {
719 kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong,
720 kULong, kFloat, kDouble, kNumExplicitTypes
721 };
722 unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
723 unsigned int size, typeIndex;
724
725 int errors = 0;
726
727 if (!is_extension_available(deviceID, "cl_khr_async_work_group_copy_fence"))
728 {
729 log_info(
730 "Device does not support extended async copies fence. Skipping "
731 "test.\n");
732 return 0;
733 }
734
735 for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
736 {
737 if (vecType[typeIndex] == kDouble
738 && !is_extension_available(deviceID, "cl_khr_fp64"))
739 continue;
740
741 if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
742 && !gHasLong)
743 continue;
744
745 for (size = 0; vecSizes[size] != 0; size++)
746 {
747 if (test_copy_fence(deviceID, context, queue, kernelCode,
748 vecType[typeIndex], vecSizes[size],
749 export_after_import, aliased_local_mem,
750 aliased_global_mem))
751 {
752 errors++;
753 }
754 }
755 }
756 if (errors) return -1;
757 return 0;
758 }
759
test_async_work_group_copy_fence_import_after_export_aliased_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)760 int test_async_work_group_copy_fence_import_after_export_aliased_local(
761 cl_device_id deviceID, cl_context context, cl_command_queue queue,
762 int num_elements)
763 {
764 return test_copy_fence_all_types(deviceID, context, queue,
765 import_after_export_aliased_local_kernel,
766 false, true, false);
767 }
768
test_async_work_group_copy_fence_import_after_export_aliased_global(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)769 int test_async_work_group_copy_fence_import_after_export_aliased_global(
770 cl_device_id deviceID, cl_context context, cl_command_queue queue,
771 int num_elements)
772 {
773 return test_copy_fence_all_types(deviceID, context, queue,
774 import_after_export_aliased_global_kernel,
775 false, false, true);
776 }
777
test_async_work_group_copy_fence_import_after_export_aliased_global_and_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)778 int test_async_work_group_copy_fence_import_after_export_aliased_global_and_local(
779 cl_device_id deviceID, cl_context context, cl_command_queue queue,
780 int num_elements)
781 {
782 return test_copy_fence_all_types(
783 deviceID, context, queue,
784 import_after_export_aliased_global_and_local_kernel, false, true, true);
785 }
786
test_async_work_group_copy_fence_export_after_import_aliased_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)787 int test_async_work_group_copy_fence_export_after_import_aliased_local(
788 cl_device_id deviceID, cl_context context, cl_command_queue queue,
789 int num_elements)
790 {
791 return test_copy_fence_all_types(deviceID, context, queue,
792 export_after_import_aliased_local_kernel,
793 true, true, false);
794 }
795
test_async_work_group_copy_fence_export_after_import_aliased_global(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)796 int test_async_work_group_copy_fence_export_after_import_aliased_global(
797 cl_device_id deviceID, cl_context context, cl_command_queue queue,
798 int num_elements)
799 {
800 return test_copy_fence_all_types(deviceID, context, queue,
801 export_after_import_aliased_global_kernel,
802 true, false, true);
803 }
804
test_async_work_group_copy_fence_export_after_import_aliased_global_and_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)805 int test_async_work_group_copy_fence_export_after_import_aliased_global_and_local(
806 cl_device_id deviceID, cl_context context, cl_command_queue queue,
807 int num_elements)
808 {
809 return test_copy_fence_all_types(
810 deviceID, context, queue,
811 export_after_import_aliased_global_and_local_kernel, true, true, true);
812 }
813