1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "../../test_common/harness/compat.h"
17 
18 #include <algorithm>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24 
25 #include "../../test_common/harness/conversions.h"
26 #include "procs.h"
27 
28 static const char *async_global_to_local_kernel3D =
29     "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
30     "%s\n" // optional pragma string
31     "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
32     "%s *localBuffer, int numElementsPerLine, int numLines, int "
33     "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
34     "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
35     "{\n"
36     " int i, j, k;\n"
37     // Zero the local storage first
38     " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
39     "   for(j=0; j<numLines; j++)\n"
40     "     for(k=0; k<numElementsPerLine; k++)\n"
41     "       localBuffer[ (get_local_id( 0 "
42     ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
43     "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
44     "dstLineStride) + k ] = (%s)(%s)0;\n"
45     // Do this to verify all kernels are done zeroing the local buffer before we
46     // try the copy
47     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
48     "    event_t event;\n"
49     "    event = async_work_group_copy_3D3D( (__local %s*)localBuffer, "
50     "(__global const "
51     "%s*)(src+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
52     "numElementsPerLine + numLines*srcLineStride + srcPlaneStride)), "
53     "(size_t)numElementsPerLine, (size_t)numLines, srcLineStride, "
54     "dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, dstPlaneStride, "
55     "0 );\n"
56     // Wait for the copy to complete, then verify by manually copying to the
57     // dest
58     " wait_group_events( 1, &event );\n"
59     " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
60     "   for(j=0; j<numLines; j++)\n"
61     "     for(k=0; k<numElementsPerLine; k++)\n"
62     "       dst[ (get_global_id( 0 "
63     ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
64     "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
65     "dstLineStride) + k ] = localBuffer[ (get_local_id( 0 "
66     ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
67     "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
68     "dstLineStride) + k ];\n"
69     "}\n";
70 
71 static const char *async_local_to_global_kernel3D =
72     "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
73     "%s\n" // optional pragma string
74     "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
75     "%s *localBuffer, int numElementsPerLine, int numLines, int "
76     "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
77     "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
78     "{\n"
79     " int i, j, k;\n"
80     // Zero the local storage first
81     " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
82     "   for(j=0; j<numLines; j++)\n"
83     "     for(k=0; k<numElementsPerLine; k++)\n"
84     "       localBuffer[ (get_local_id( 0 "
85     ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
86     "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
87     "srcLineStride) + k ] = (%s)(%s)0;\n"
88     // Do this to verify all kernels are done zeroing the local buffer before we
89     // try the copy
90     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
91     " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
92     "   for(j=0; j<numLines; j++)\n"
93     "     for(k=0; k<numElementsPerLine; k++)\n"
94     "       localBuffer[ (get_local_id( 0 "
95     ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
96     "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
97     "srcLineStride) + k ] = src[ (get_global_id( 0 "
98     ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
99     "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
100     "srcLineStride) + k ];\n"
101     // Do this to verify all kernels are done copying to the local buffer before
102     // we try the copy
103     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
104     "    event_t event;\n"
105     "    event = async_work_group_copy_3D3D((__global "
106     "%s*)(dst+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
107     "numElementsPerLine + numLines*dstLineStride + dstPlaneStride)), (__local "
108     "const %s*)localBuffer, (size_t)numElementsPerLine, (size_t)numLines, "
109     "srcLineStride, dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, "
110     "dstPlaneStride, 0 );\n"
111     "    wait_group_events( 1, &event );\n"
112     "}\n";
113 
test_copy3D(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,ExplicitType vecType,int vecSize,int srcLineStride,int dstLineStride,int srcPlaneStride,int dstPlaneStride,bool localIsDst)114 int test_copy3D(cl_device_id deviceID, cl_context context,
115                 cl_command_queue queue, const char *kernelCode,
116                 ExplicitType vecType, int vecSize, int srcLineStride,
117                 int dstLineStride, int srcPlaneStride, int dstPlaneStride,
118                 bool localIsDst)
119 {
120     int error;
121     clProgramWrapper program;
122     clKernelWrapper kernel;
123     clMemWrapper streams[2];
124     size_t threads[1], localThreads[1];
125     void *inBuffer, *outBuffer, *outBufferCopy;
126     MTdata d;
127     char vecNameString[64];
128     vecNameString[0] = 0;
129     if (vecSize == 1)
130         sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
131     else
132         sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
133                 vecSize);
134 
135     size_t elementSize = get_explicit_type_size(vecType) * vecSize;
136     log_info("Testing %s with srcLineStride = %d, dstLineStride = %d, "
137              "srcPlaneStride = %d, dstPlaneStride = %d\n",
138              vecNameString, srcLineStride, dstLineStride, srcPlaneStride,
139              dstPlaneStride);
140 
141     cl_long max_local_mem_size;
142     error =
143         clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
144                         sizeof(max_local_mem_size), &max_local_mem_size, NULL);
145     test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
146 
147     cl_long max_global_mem_size;
148     error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
149                             sizeof(max_global_mem_size), &max_global_mem_size,
150                             NULL);
151     test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
152 
153     cl_long max_alloc_size;
154     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
155                             sizeof(max_alloc_size), &max_alloc_size, NULL);
156     test_error(error,
157                "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
158 
159     if (max_alloc_size > max_global_mem_size / 2)
160         max_alloc_size = max_global_mem_size / 2;
161 
162     unsigned int num_of_compute_devices;
163     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
164                             sizeof(num_of_compute_devices),
165                             &num_of_compute_devices, NULL);
166     test_error(error,
167                "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
168 
169     char programSource[4096];
170     programSource[0] = 0;
171     char *programPtr;
172 
173     sprintf(programSource, kernelCode,
174             vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
175                                : "",
176             vecNameString, vecNameString, vecNameString, vecNameString,
177             get_explicit_type_name(vecType), vecNameString, vecNameString);
178     // log_info("program: %s\n", programSource);
179     programPtr = programSource;
180 
181     error = create_single_kernel_helper(context, &program, &kernel, 1,
182                                         (const char **)&programPtr, "test_fn");
183     test_error(error, "Unable to create testing kernel");
184 
185     size_t max_workgroup_size;
186     error = clGetKernelWorkGroupInfo(
187         kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
188         &max_workgroup_size, NULL);
189     test_error(
190         error,
191         "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
192 
193     size_t max_local_workgroup_size[3];
194     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
195                             sizeof(max_local_workgroup_size),
196                             max_local_workgroup_size, NULL);
197     test_error(error,
198                "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
199 
200     // Pick the minimum of the device and the kernel
201     if (max_workgroup_size > max_local_workgroup_size[0])
202         max_workgroup_size = max_local_workgroup_size[0];
203 
204     size_t numElementsPerLine = 10;
205     size_t numLines = 13;
206     size_t planesCopiesPerWorkItem = 2;
207     elementSize =
208         get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
209     size_t localStorageSpacePerWorkitem = elementSize
210         * (planesCopiesPerWorkItem
211            * (numLines * numElementsPerLine
212               + numLines * (localIsDst ? dstLineStride : srcLineStride)
213               + (localIsDst ? dstPlaneStride : srcPlaneStride)));
214     size_t maxLocalWorkgroupSize =
215         (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
216 
217     // Calculation can return 0 on embedded devices due to 1KB local mem limit
218     if (maxLocalWorkgroupSize == 0)
219     {
220         maxLocalWorkgroupSize = 1;
221     }
222 
223     size_t localWorkgroupSize = maxLocalWorkgroupSize;
224     if (maxLocalWorkgroupSize > max_workgroup_size)
225         localWorkgroupSize = max_workgroup_size;
226 
227     size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride)
228         / ((numLines * numElementsPerLine + numLines * srcLineStride)
229            + srcPlaneStride);
230     size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride)
231         / ((numLines * numElementsPerLine + numLines * dstLineStride)
232            + dstPlaneStride);
233     size_t maxTotalPlanes = (std::min)(maxTotalPlanesIn, maxTotalPlanesOut);
234     size_t maxLocalWorkgroups =
235         maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem);
236 
237     size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
238         - (localIsDst ? dstPlaneStride : srcPlaneStride);
239     size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
240     size_t totalPlanes =
241         numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem;
242     size_t inBufferSize = elementSize
243         * (totalPlanes
244                * (numLines * numElementsPerLine + numLines * srcLineStride)
245            + (totalPlanes - 1) * srcPlaneStride);
246     size_t outBufferSize = elementSize
247         * (totalPlanes
248                * (numLines * numElementsPerLine + numLines * dstLineStride)
249            + (totalPlanes - 1) * dstPlaneStride);
250     size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
251 
252     inBuffer = (void *)malloc(inBufferSize);
253     outBuffer = (void *)malloc(outBufferSize);
254     outBufferCopy = (void *)malloc(outBufferSize);
255 
256     cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt,
257         planesCopiesPerWorkgroup;
258     planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem;
259     numElementsPerLineInt = (int)numElementsPerLine;
260     numLinesInt = (int)numLines;
261     planesCopiesPerWorkgroup =
262         (int)(planesCopiesPerWorkItem * localWorkgroupSize);
263 
264     log_info("Global: %d, local %d, local buffer %db, global in buffer %db, "
265              "global out buffer %db, each work group will copy %d planes and "
266              "each work item item will copy %d planes.\n",
267              (int)globalWorkgroupSize, (int)localWorkgroupSize,
268              (int)localBufferSize, (int)inBufferSize, (int)outBufferSize,
269              planesCopiesPerWorkgroup, planesCopiesPerWorkItemInt);
270 
271     threads[0] = globalWorkgroupSize;
272     localThreads[0] = localWorkgroupSize;
273 
274     d = init_genrand(gRandomSeed);
275     generate_random_data(
276         vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer);
277     generate_random_data(
278         vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer);
279     free_mtdata(d);
280     d = NULL;
281     memcpy(outBufferCopy, outBuffer, outBufferSize);
282 
283     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
284                                 inBuffer, &error);
285     test_error(error, "Unable to create input buffer");
286     streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
287                                 outBuffer, &error);
288     test_error(error, "Unable to create output buffer");
289 
290     error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
291     test_error(error, "Unable to set kernel argument");
292     error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
293     test_error(error, "Unable to set kernel argument");
294     error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
295     test_error(error, "Unable to set kernel argument");
296     error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
297                            &numElementsPerLineInt);
298     test_error(error, "Unable to set kernel argument");
299     error = clSetKernelArg(kernel, 4, sizeof(numLinesInt), &numLinesInt);
300     test_error(error, "Unable to set kernel argument");
301     error = clSetKernelArg(kernel, 5, sizeof(planesCopiesPerWorkgroup),
302                            &planesCopiesPerWorkgroup);
303     test_error(error, "Unable to set kernel argument");
304     error = clSetKernelArg(kernel, 6, sizeof(planesCopiesPerWorkItemInt),
305                            &planesCopiesPerWorkItemInt);
306     test_error(error, "Unable to set kernel argument");
307     error = clSetKernelArg(kernel, 7, sizeof(srcLineStride), &srcLineStride);
308     test_error(error, "Unable to set kernel argument");
309     error = clSetKernelArg(kernel, 8, sizeof(dstLineStride), &dstLineStride);
310     test_error(error, "Unable to set kernel argument");
311     error = clSetKernelArg(kernel, 9, sizeof(srcPlaneStride), &srcPlaneStride);
312     test_error(error, "Unable to set kernel argument");
313     error = clSetKernelArg(kernel, 10, sizeof(dstPlaneStride), &dstPlaneStride);
314     test_error(error, "Unable to set kernel argument");
315 
316     // Enqueue
317     error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
318                                    localThreads, 0, NULL, NULL);
319     test_error(error, "Unable to queue kernel");
320 
321     // Read
322     error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
323                                 outBuffer, 0, NULL, NULL);
324     test_error(error, "Unable to read results");
325 
326     // Verify
327     int failuresPrinted = 0;
328     // Verify
329     size_t typeSize = get_explicit_type_size(vecType) * vecSize;
330     for (int i = 0;
331          i < (int)globalWorkgroupSize * planesCopiesPerWorkItem * elementSize;
332          i += elementSize)
333     {
334         for (int j = 0; j < (int)numLines * elementSize; j += elementSize)
335         {
336             for (int k = 0; k < (int)numElementsPerLine * elementSize;
337                  k += elementSize)
338             {
339                 int inIdx = i
340                         * (numLines * numElementsPerLine
341                            + numLines * srcLineStride + srcPlaneStride)
342                     + j * (numElementsPerLine + srcLineStride) + k;
343                 int outIdx = i
344                         * (numLines * numElementsPerLine
345                            + numLines * dstLineStride + dstPlaneStride)
346                     + j * (numElementsPerLine + dstLineStride) + k;
347                 if (memcmp(((char *)inBuffer) + inIdx,
348                            ((char *)outBuffer) + outIdx, typeSize)
349                     != 0)
350                 {
351                     unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
352                     unsigned char *outchar =
353                         (unsigned char *)outBuffer + outIdx;
354                     char values[4096];
355                     values[0] = 0;
356 
357                     if (failuresPrinted == 0)
358                     {
359                         // Print first failure message
360                         log_error("ERROR: Results of copy did not validate!");
361                     }
362                     sprintf(values + strlen(values), "%d -> [", inIdx);
363                     for (int l = 0; l < (int)elementSize; l++)
364                         sprintf(values + strlen(values), "%2x ", inchar[l]);
365                     sprintf(values + strlen(values), "] != [");
366                     for (int l = 0; l < (int)elementSize; l++)
367                         sprintf(values + strlen(values), "%2x ", outchar[l]);
368                     sprintf(values + strlen(values), "]");
369                     log_error("%s\n", values);
370                     failuresPrinted++;
371                 }
372 
373                 if (failuresPrinted > 5)
374                 {
375                     log_error("Not printing further failures...\n");
376                     return -1;
377                 }
378             }
379             if (j < (int)numLines * elementSize)
380             {
381                 int outIdx = i
382                         * (numLines * numElementsPerLine
383                            + numLines * dstLineStride + dstPlaneStride)
384                     + j * (numElementsPerLine + dstLineStride)
385                     + numElementsPerLine * elementSize;
386                 if (memcmp(((char *)outBuffer) + outIdx,
387                            ((char *)outBufferCopy) + outIdx,
388                            dstLineStride * elementSize)
389                     != 0)
390                 {
391                     if (failuresPrinted == 0)
392                     {
393                         // Print first failure message
394                         log_error("ERROR: Results of copy did not validate!\n");
395                     }
396                     log_error(
397                         "3D copy corrupted data in output buffer in the line "
398                         "stride offset of plane %d line %d\n",
399                         i, j);
400                     failuresPrinted++;
401                 }
402                 if (failuresPrinted > 5)
403                 {
404                     log_error("Not printing further failures...\n");
405                     return -1;
406                 }
407             }
408         }
409         if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1)
410                 * elementSize)
411         {
412             int outIdx = i
413                     * (numLines * numElementsPerLine + numLines * dstLineStride
414                        + dstPlaneStride)
415                 + (numLines * elementSize) * (numElementsPerLine)
416                 + (numLines * elementSize) * (dstLineStride);
417             if (memcmp(((char *)outBuffer) + outIdx,
418                        ((char *)outBufferCopy) + outIdx,
419                        dstPlaneStride * elementSize)
420                 != 0)
421             {
422                 if (failuresPrinted == 0)
423                 {
424                     // Print first failure message
425                     log_error("ERROR: Results of copy did not validate!\n");
426                 }
427                 log_error("3D copy corrupted data in output buffer in the "
428                           "plane stride "
429                           "offset of plane %d\n",
430                           i);
431                 failuresPrinted++;
432             }
433             if (failuresPrinted > 5)
434             {
435                 log_error("Not printing further failures...\n");
436                 return -1;
437             }
438         }
439     }
440 
441     free(inBuffer);
442     free(outBuffer);
443     free(outBufferCopy);
444 
445     return failuresPrinted ? -1 : 0;
446 }
447 
test_copy3D_all_types(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,bool localIsDst)448 int test_copy3D_all_types(cl_device_id deviceID, cl_context context,
449                           cl_command_queue queue, const char *kernelCode,
450                           bool localIsDst)
451 {
452     ExplicitType vecType[] = {
453         kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
454         kULong, kFloat, kDouble, kNumExplicitTypes
455     };
456     unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
457     unsigned int smallTypesStrideSizes[] = { 0, 10, 100 };
458     unsigned int size, typeIndex, srcLineStride, dstLineStride, srcPlaneStride,
459         dstPlaneStride;
460 
461     int errors = 0;
462 
463     if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
464     {
465         log_info(
466             "Device does not support extended async copies. Skipping test.\n");
467         return 0;
468     }
469 
470     for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
471     {
472         if (vecType[typeIndex] == kDouble
473             && !is_extension_available(deviceID, "cl_khr_fp64"))
474             continue;
475 
476         if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
477             && !gHasLong)
478             continue;
479 
480         for (size = 0; vecSizes[size] != 0; size++)
481         {
482             if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
483                 <= 2) // small type
484             {
485                 for (srcLineStride = 0;
486                      srcLineStride < sizeof(smallTypesStrideSizes)
487                          / sizeof(smallTypesStrideSizes[0]);
488                      srcLineStride++)
489                 {
490                     for (dstLineStride = 0;
491                          dstLineStride < sizeof(smallTypesStrideSizes)
492                              / sizeof(smallTypesStrideSizes[0]);
493                          dstLineStride++)
494                     {
495                         for (srcPlaneStride = 0;
496                              srcPlaneStride < sizeof(smallTypesStrideSizes)
497                                  / sizeof(smallTypesStrideSizes[0]);
498                              srcPlaneStride++)
499                         {
500                             for (dstPlaneStride = 0;
501                                  dstPlaneStride < sizeof(smallTypesStrideSizes)
502                                      / sizeof(smallTypesStrideSizes[0]);
503                                  dstPlaneStride++)
504                             {
505                                 if (test_copy3D(
506                                         deviceID, context, queue, kernelCode,
507                                         vecType[typeIndex], vecSizes[size],
508                                         smallTypesStrideSizes[srcLineStride],
509                                         smallTypesStrideSizes[dstLineStride],
510                                         smallTypesStrideSizes[srcPlaneStride],
511                                         smallTypesStrideSizes[dstPlaneStride],
512                                         localIsDst))
513                                 {
514                                     errors++;
515                                 }
516                             }
517                         }
518                     }
519                 }
520             }
521             // not a small type, check only zero stride
522             else if (test_copy3D(deviceID, context, queue, kernelCode,
523                                  vecType[typeIndex], vecSizes[size], 0, 0, 0, 0,
524                                  localIsDst))
525             {
526                 errors++;
527             }
528         }
529     }
530     if (errors) return -1;
531     return 0;
532 }
533 
test_async_copy_global_to_local3D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)534 int test_async_copy_global_to_local3D(cl_device_id deviceID, cl_context context,
535                                       cl_command_queue queue, int num_elements)
536 {
537     return test_copy3D_all_types(deviceID, context, queue,
538                                  async_global_to_local_kernel3D, true);
539 }
540 
test_async_copy_local_to_global3D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)541 int test_async_copy_local_to_global3D(cl_device_id deviceID, cl_context context,
542                                       cl_command_queue queue, int num_elements)
543 {
544     return test_copy3D_all_types(deviceID, context, queue,
545                                  async_local_to_global_kernel3D, false);
546 }
547