1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <stdio.h>
17 #include <string.h>
18 #include "harness/testHarness.h"
19 #include "harness/typeWrappers.h"
20 
21 #include <vector>
22 
23 #include "procs.h"
24 #include "utils.h"
25 #include <time.h>
26 
27 
28 #ifdef CL_VERSION_2_0
29 extern int gWimpyMode;
30 static const char *helper_ndrange_1d_glo[] = {
31     NL,
32     "void block_fn(int len, __global atomic_uint* val)" NL,
33     "{" NL,
34     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
35     "memory_order_relaxed, memory_scope_device);" NL,
36     "}" NL,
37     "" NL,
38     "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, "
39     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
40     "atomic_uint* val,  __global uint* ofs_arr)" NL,
41     "{" NL,
42     "  size_t tid = get_global_id(0);" NL,
43     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
44     "" NL,
45     "  for(int i = 0; i < n; i++)" NL,
46     "  {" NL,
47     "    ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);" NL,
48     "    int enq_res = enqueue_kernel(get_default_queue(), "
49     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
50     "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
51     "  }" NL,
52     "}" NL
53 };
54 
55 static const char *helper_ndrange_1d_loc[] = {
56     NL,
57     "void block_fn(int len, __global atomic_uint* val)" NL,
58     "{" NL,
59     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
60     "memory_order_relaxed, memory_scope_device);" NL,
61     "}" NL,
62     "" NL,
63     "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, "
64     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
65     "atomic_uint* val,  __global uint* ofs_arr)" NL,
66     "{" NL,
67     "  size_t tid = get_global_id(0);" NL,
68     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
69     "" NL,
70     "  for(int k = 0; k < n; k++)" NL,
71     "  {" NL,
72     "    for(int i = 0; i < n; i++)" NL,
73     "    {" NL,
74     "      if (glob_size_arr[i] >= loc_size_arr[k])" NL,
75     "      {" NL,
76     "        ndrange_t ndrange = ndrange_1D(glob_size_arr[i], "
77     "loc_size_arr[k]);" NL,
78     "        int enq_res = enqueue_kernel(get_default_queue(), "
79     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
80     "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
81     "      }" NL,
82     "    }" NL,
83     "  }" NL,
84     "}" NL
85 };
86 
87 static const char *helper_ndrange_1d_ofs[] = {
88     NL,
89     "void block_fn(int len, __global atomic_uint* val)" NL,
90     "{" NL,
91     "  atomic_fetch_add_explicit(&val[(get_global_offset(0) + "
92     "get_global_linear_id()) % len], 1u, memory_order_relaxed, "
93     "memory_scope_device);" NL,
94     "}" NL,
95     "" NL,
96     "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, "
97     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
98     "atomic_uint* val,  __global uint* ofs_arr)" NL,
99     "{" NL,
100     "  size_t tid = get_global_id(0);" NL,
101     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
102     "" NL,
103     "  for(int l = 0; l < n; l++)" NL,
104     "  {" NL,
105     "    for(int k = 0; k < n; k++)" NL,
106     "    {" NL,
107     "      for(int i = 0; i < n; i++)" NL,
108     "      {" NL,
109     "        if (glob_size_arr[i] >= loc_size_arr[k])" NL,
110     "        {" NL,
111     "          ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], "
112     "loc_size_arr[k]);" NL,
113     "          int enq_res = enqueue_kernel(get_default_queue(), "
114     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
115     "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
116     "        }" NL,
117     "      }" NL,
118     "    }" NL,
119     "  }" NL,
120     "}" NL
121 };
122 
123 static const char *helper_ndrange_2d_glo[] = {
124     NL,
125     "void block_fn(int len, __global atomic_uint* val)" NL,
126     "{" NL,
127     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
128     "memory_order_relaxed, memory_scope_device);" NL,
129     "}" NL,
130     "" NL,
131     "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, "
132     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
133     "val,  __global uint* ofs_arr)" NL,
134     "{" NL,
135     "  size_t tid = get_global_id(0);" NL,
136     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
137     "" NL,
138     "  for(int i = 0; i < n; i++)" NL,
139     "  {" NL,
140     "    size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] "
141     "};" NL,
142     "    ndrange_t ndrange = ndrange_2D(glob_size);" NL,
143     "    int enq_res = enqueue_kernel(get_default_queue(), "
144     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
145     "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
146     "  }" NL,
147     "}" NL
148 };
149 
150 static const char *helper_ndrange_2d_loc[] = {
151     NL,
152     "void block_fn(int len, __global atomic_uint* val)" NL,
153     "{" NL,
154     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
155     "memory_order_relaxed, memory_scope_device);" NL,
156     "}" NL,
157     "" NL,
158     "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, "
159     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
160     "val,  __global uint* ofs_arr)" NL,
161     "{" NL,
162     "  size_t tid = get_global_id(0);" NL,
163     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
164     "" NL,
165     "  for(int k = 0; k < n; k++)" NL,
166     "  {" NL,
167     "    for(int i = 0; i < n; i++)" NL,
168     "    {" NL,
169     "      if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
170     "      {" NL,
171     "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
172     "n] };" NL,
173     "        size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
174     "" NL,
175     "        ndrange_t ndrange = ndrange_2D(glob_size, loc_size);" NL,
176     "        int enq_res = enqueue_kernel(get_default_queue(), "
177     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
178     "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
179     "      }" NL,
180     "    }" NL,
181     "  }" NL,
182     "}" NL
183 };
184 
185 
186 static const char *helper_ndrange_2d_ofs[] = {
187     NL,
188     "void block_fn(int len, __global atomic_uint* val)" NL,
189     "{" NL,
190     "  atomic_fetch_add_explicit(&val[(get_global_offset(1) * "
191     "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
192     "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
193     "}" NL,
194     "" NL,
195     "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, "
196     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
197     "val,  __global uint* ofs_arr)" NL,
198     "{" NL,
199     "  size_t tid = get_global_id(0);" NL,
200     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
201     "" NL,
202     "  for(int l = 0; l < n; l++)" NL,
203     "  {" NL,
204     "    for(int k = 0; k < n; k++)" NL,
205     "    {" NL,
206     "      for(int i = 0; i < n; i++)" NL,
207     "      {" NL,
208     "        if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
209     "        {" NL,
210     "          size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) "
211     "% n]};" NL,
212     "          size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
213     "          size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };" NL,
214     "" NL,
215     "          ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);" NL,
216     "          int enq_res = enqueue_kernel(get_default_queue(), "
217     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
218     "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
219     "        }" NL,
220     "      }" NL,
221     "    }" NL,
222     "  }" NL,
223     "}" NL
224 };
225 
226 
227 static const char *helper_ndrange_3d_glo[] = {
228     NL,
229     "void block_fn(int len, __global atomic_uint* val)" NL,
230     "{" NL,
231     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
232     "memory_order_relaxed, memory_scope_device);" NL,
233     "}" NL,
234     "" NL,
235     "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, "
236     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
237     "val,  __global uint* ofs_arr)" NL,
238     "{" NL,
239     "  size_t tid = get_global_id(0);" NL,
240     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
241     "" NL,
242     "  for(int i = 0; i < n; i++)" NL,
243     "  {" NL,
244     "    uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % "
245     "n] * glob_size_arr[(i + 2) % n];" NL,
246     "    if (global_work_size <= (len * len))" NL,
247     "    {" NL,
248     "      size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
249     "n], glob_size_arr[(i + 2) % n] };" NL,
250     "      ndrange_t ndrange = ndrange_3D(glob_size);" NL,
251     "      int enq_res = enqueue_kernel(get_default_queue(), "
252     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
253     "      if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
254     "    }" NL,
255     "  }" NL,
256     "}" NL
257 };
258 
259 
260 static const char *helper_ndrange_3d_loc[] = {
261     NL,
262     "void block_fn(int len, __global atomic_uint* val)" NL,
263     "{" NL,
264     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
265     "memory_order_relaxed, memory_scope_device);" NL,
266     "}" NL,
267     "" NL,
268     "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, "
269     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
270     "val,  __global uint* ofs_arr)" NL,
271     "{" NL,
272     "  size_t tid = get_global_id(0);" NL,
273     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
274     "" NL,
275     "  for(int k = 0; k < n; k++)" NL,
276     "  {" NL,
277     "    for(int i = 0; i < n; i++)" NL,
278     "    {" NL,
279     "      uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % "
280     "n] * glob_size_arr[(i + 2) % n];" NL,
281     "      if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
282     "global_work_size <= (len * len))" NL,
283     "      {" NL,
284     "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
285     "n], glob_size_arr[(i + 2) % n] };" NL,
286     "        size_t loc_size[] = { 1, 1, loc_size_arr[k] };" NL,
287     "        ndrange_t ndrange = ndrange_3D(glob_size,loc_size);" NL,
288     "        int enq_res = enqueue_kernel(get_default_queue(), "
289     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
290     "      " NL,
291     "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
292     "      }" NL,
293     "    }" NL,
294     "  }" NL,
295     "}" NL
296 };
297 
298 static const char *helper_ndrange_3d_ofs[] = {
299     NL,
300     "void block_fn(int len, __global atomic_uint* val)" NL,
301     "{" NL,
302     "  atomic_fetch_add_explicit(&val[(get_global_offset(2) * "
303     "get_global_size(0) * get_global_size(1) + get_global_offset(1) * "
304     "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
305     "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
306     "}" NL,
307     "" NL,
308     "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, "
309     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
310     "val,  __global uint* ofs_arr)" NL,
311     "{" NL,
312     "  size_t tid = get_global_id(0);" NL,
313     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
314     "" NL,
315     "  for(int l = 0; l < n; l++)" NL,
316     "  {" NL,
317     "    for(int k = 0; k < n; k++)" NL,
318     "    {" NL,
319     "      for(int i = 0; i < n; i++)" NL,
320     "      {" NL,
321     "        uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) "
322     "% n] * glob_size_arr[(i + 2) % n];" NL,
323     "        if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
324     "global_work_size <= (len * len))" NL,
325     "        {" NL,
326     "          size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) "
327     "% n], glob_size_arr[(i + 2) % n]};" NL,
328     "          size_t loc_size[3] = { 1, 1, loc_size_arr[k] };" NL,
329     "          size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l "
330     "+ 2) % n] };" NL,
331     "          ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);" NL,
332     "          int enq_res = enqueue_kernel(get_default_queue(), "
333     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
334     "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
335     "        }" NL,
336     "      }" NL,
337     "    }" NL,
338     "  }" NL,
339     "}" NL
340 };
341 
342 static const kernel_src_dim_check sources_ndrange_Xd[] =
343 {
344     { KERNEL(helper_ndrange_1d_glo), 1, CL_FALSE, CL_FALSE},
345     { KERNEL(helper_ndrange_1d_loc), 1, CL_TRUE, CL_FALSE},
346     { KERNEL(helper_ndrange_1d_ofs), 1, CL_TRUE, CL_TRUE},
347     { KERNEL(helper_ndrange_2d_glo), 2, CL_FALSE, CL_FALSE},
348     { KERNEL(helper_ndrange_2d_loc), 2, CL_TRUE, CL_FALSE},
349     { KERNEL(helper_ndrange_2d_ofs), 2, CL_TRUE, CL_TRUE},
350     { KERNEL(helper_ndrange_3d_glo), 3, CL_FALSE, CL_FALSE},
351     { KERNEL(helper_ndrange_3d_loc), 3, CL_TRUE, CL_FALSE},
352     { KERNEL(helper_ndrange_3d_ofs), 3, CL_TRUE, CL_TRUE},
353 };
354 static const size_t num_kernels_ndrange_Xd = arr_size(sources_ndrange_Xd);
355 
check_kernel_results(cl_int * results,cl_int len)356 static int check_kernel_results(cl_int* results, cl_int len)
357 {
358     for(cl_int i = 0; i < len; ++i)
359     {
360         if(results[i] != 0) return i;
361     }
362     return -1;
363 }
364 
generate_reference_1D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr)365 void generate_reference_1D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr)
366 {
367     for (size_t g = 0; g < glob_size_arr.size(); ++g)
368     {
369         for (size_t w = 0; w < glob_size_arr[g]; ++w)
370         {
371             ++reference_results[w];
372         }
373     }
374 }
375 
generate_reference_1D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr)376 void generate_reference_1D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr)
377 {
378     for (size_t g = 0; g < glob_size_arr.size(); ++g)
379     {
380         for (size_t l = 0; l < loc_size_arr.size(); ++l)
381         {
382             if (glob_size_arr[g] >= loc_size_arr[l])
383             {
384                 for (size_t w = 0; w < glob_size_arr[g]; ++w)
385                 {
386                     ++reference_results[w];
387                 }
388             }
389         }
390     }
391 }
392 
generate_reference_1D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)393 void generate_reference_1D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
394 {
395     for (size_t g = 0; g < glob_size_arr.size(); ++g)
396     {
397         for (size_t l = 0; l < loc_size_arr.size(); ++l)
398         {
399             if (glob_size_arr[g] >= loc_size_arr[l])
400             {
401                 for (size_t o = 0; o < offset.size(); ++o)
402                 {
403                     for (size_t w = 0; w < glob_size_arr[g]; ++w)
404                     {
405                         ++reference_results[(offset[o] + w) % len];
406                     }
407                 }
408             }
409         }
410     }
411 }
412 
generate_reference_2D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)413 void generate_reference_2D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
414 {
415     for (size_t g = 0; g < glob_size_arr.size(); ++g)
416     {
417         for (size_t h = 0; h < glob_size_arr[(g + 1) % glob_size_arr.size()]; ++h)
418         {
419             for (size_t w = 0; w < glob_size_arr[g]; ++w)
420             {
421                 ++reference_results[(h * glob_size_arr[g] + w) % len];
422             }
423         }
424     }
425 }
426 
generate_reference_2D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,cl_uint len)427 void generate_reference_2D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
428 {
429     size_t n = glob_size_arr.size();
430     for (size_t g = 0; g < glob_size_arr.size(); ++g)
431     {
432         for (size_t l = 0; l < loc_size_arr.size(); ++l)
433         {
434             if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
435             {
436                 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
437                 {
438                     for (size_t w = 0; w < glob_size_arr[g]; ++w)
439                     {
440                         ++reference_results[(h * glob_size_arr[g] + w) % len];
441                     }
442                 }
443             }
444         }
445     }
446 }
447 
generate_reference_2D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)448 void generate_reference_2D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
449 {
450     size_t n = glob_size_arr.size();
451     for (size_t g = 0; g < glob_size_arr.size(); ++g)
452     {
453         for (size_t l = 0; l < loc_size_arr.size(); ++l)
454         {
455             if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
456             {
457                 for (size_t o = 0; o < offset.size(); ++o)
458                 {
459                     for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
460                     {
461                         for (size_t w = 0; w < glob_size_arr[g]; ++w)
462                         {
463                             ++reference_results[(glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + h * glob_size_arr[g] + w) % len];
464                         }
465                     }
466                 }
467             }
468         }
469     }
470 }
471 
generate_reference_3D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)472 void generate_reference_3D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
473 {
474     size_t n = glob_size_arr.size();
475     for (size_t g = 0; g < glob_size_arr.size(); ++g)
476     {
477         size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
478         if(global_work_size <= (len * len))
479         {
480             for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
481             {
482                 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
483                 {
484                     for (size_t w = 0; w < glob_size_arr[g]; ++w)
485                     {
486                         ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
487                     }
488                 }
489             }
490         }
491     }
492 }
493 
generate_reference_3D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,cl_uint len)494 void generate_reference_3D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
495 {
496     size_t n = glob_size_arr.size();
497     for (size_t g = 0; g < glob_size_arr.size(); ++g)
498     {
499         for (size_t l = 0; l < loc_size_arr.size(); ++l)
500         {
501             size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
502             if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
503             {
504                 for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
505                 {
506                     for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
507                     {
508                         for (size_t w = 0; w < glob_size_arr[g]; ++w)
509                         {
510                             ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
511                         }
512                     }
513                 }
514             }
515         }
516     }
517 }
518 
generate_reference_3D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)519 void generate_reference_3D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
520 {
521     size_t n = glob_size_arr.size();
522     for (size_t g = 0; g < glob_size_arr.size(); ++g)
523     {
524         for (size_t l = 0; l < loc_size_arr.size(); ++l)
525         {
526             size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
527             if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
528             {
529                 for (size_t o = 0; o < offset.size(); ++o)
530                 {
531                     for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
532                     {
533                         for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
534                         {
535                             for (size_t w = 0; w < glob_size_arr[g]; ++w)
536                             {
537                                 ++reference_results[(glob_size_arr[g] * glob_size_arr[(g + 1) % n] * offset[(o + 2) % n] + glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
538                             }
539                         }
540                     }
541                 }
542             }
543         }
544     }
545 }
546 
check_kernel_results(cl_int * results,cl_int len,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_int dim,cl_bool use_local,cl_bool use_offset)547 static int check_kernel_results(cl_int* results, cl_int len, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_int dim, cl_bool use_local, cl_bool use_offset)
548 {
549     std::vector<cl_int> reference_results(len, 0);
550     switch (dim)
551     {
552     case 1:
553         if (use_local == CL_FALSE)
554         {
555             generate_reference_1D(reference_results, glob_size_arr);
556         }
557         else if(use_local == CL_TRUE && use_offset == CL_FALSE)
558         {
559             generate_reference_1D_local(reference_results, glob_size_arr, loc_size_arr);
560         }
561         else
562         {
563             generate_reference_1D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
564         }
565         break;
566     case 2:
567         if (use_local == CL_FALSE)
568         {
569             generate_reference_2D(reference_results, glob_size_arr, len);
570         }
571         else if (use_local == CL_TRUE && use_offset == CL_FALSE)
572         {
573             generate_reference_2D_local(reference_results, glob_size_arr, loc_size_arr, len);
574         }
575         else
576         {
577             generate_reference_2D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
578         }
579         break;
580     case 3:
581         if (use_local == CL_FALSE)
582         {
583             generate_reference_3D(reference_results, glob_size_arr, len);
584         }
585         else if (use_local == CL_TRUE && use_offset == CL_FALSE)
586         {
587             generate_reference_3D_local(reference_results, glob_size_arr, loc_size_arr, len);
588         }
589         else
590         {
591             generate_reference_3D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
592         }
593         break;
594     default:
595         return 0;
596         break;
597     }
598 
599     for (cl_int i = 0; i < len; ++i)
600     {
601         if (results[i] != reference_results[i])
602         {
603             log_error("ERROR: Kernel returned %d vs. expected %d\n", results[i], reference_results[i]);
604             return i;
605         }
606     }
607 
608     return -1;
609 }
610 
test_enqueue_ndrange(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)611 int test_enqueue_ndrange(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
612 {
613     MTdata d;
614     cl_uint i;
615     cl_int err_ret, res = 0;
616     clCommandQueueWrapper dev_queue;
617     cl_int k, kernel_results[MAX_GWS] = { 0 };
618 
619     size_t ret_len;
620     cl_uint max_queues = 1;
621     cl_uint maxQueueSize = 0;
622 
623     d = init_genrand(gRandomSeed);
624 
625     err_ret = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, sizeof(maxQueueSize), &maxQueueSize, 0);
626     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE) failed");
627 
628     err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_QUEUES, sizeof(max_queues), &max_queues, &ret_len);
629     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_ON_DEVICE_QUEUES) failed");
630 
631     size_t max_local_size = 1;
632     err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, &ret_len);
633     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed");
634 
635     cl_queue_properties queue_prop_def[] =
636     {
637         CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
638         CL_QUEUE_SIZE, maxQueueSize,
639         0
640     };
641 
642     dev_queue = clCreateCommandQueueWithProperties(context, device, queue_prop_def, &err_ret);
643     test_error(err_ret, "clCreateCommandQueueWithProperties(CL_QUEUE_DEVICE|CL_QUEUE_DEFAULT) failed");
644 
645     max_local_size = (max_local_size > MAX_GWS)? MAX_GWS: max_local_size;
646     if(gWimpyMode)
647     {
648         max_local_size = MIN(8, max_local_size);
649     }
650 
651     cl_uint num = 10;
652     cl_uint global_work_size = max_local_size * 2;
653     std::vector<cl_uint> glob_size_arr(num);
654     std::vector<cl_uint> loc_size_arr(num);
655     std::vector<cl_uint> ofs_arr(num);
656     std::vector<cl_int> glob_results(global_work_size, 0);
657 
658     glob_size_arr[0] = 1;
659     glob_size_arr[1] = global_work_size;
660     loc_size_arr[0] = 1;
661     loc_size_arr[1] = max_local_size;
662     ofs_arr[0] = 0;
663     ofs_arr[1] = 1;
664 
665     for(i = 2; i < num; ++i)
666     {
667         glob_size_arr[i] = genrand_int32(d) % global_work_size;
668         glob_size_arr[i] = glob_size_arr[i] ? glob_size_arr[i]: 1;
669         loc_size_arr[i] = genrand_int32(d) % max_local_size;
670         loc_size_arr[i] = loc_size_arr[i] ? loc_size_arr[i]: 1;
671         ofs_arr[i] = genrand_int32(d) % global_work_size;
672     }
673 
674     // check ndrange_dX functions
675     size_t failCnt = 0;
676     for(i = 0; i < num_kernels_ndrange_Xd; ++i)
677     {
678         if (!gKernelName.empty() && gKernelName != sources_ndrange_Xd[i].src.kernel_name)
679             continue;
680 
681         clMemWrapper mem1 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, glob_size_arr.size() * sizeof(cl_uint), &glob_size_arr[0], &err_ret);
682         test_error(err_ret, "clCreateBuffer() failed");
683         clMemWrapper mem2 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, loc_size_arr.size() * sizeof(cl_uint), &loc_size_arr[0], &err_ret);
684         test_error(err_ret, "clCreateBuffer() failed");
685         clMemWrapper mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, glob_results.size() * sizeof(cl_int), &glob_results[0], &err_ret);
686         test_error(err_ret, "clCreateBuffer() failed");
687         clMemWrapper mem4 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, ofs_arr.size() * sizeof(cl_uint), &ofs_arr[0], &err_ret);
688         test_error(err_ret, "clCreateBuffer() failed");
689 
690         kernel_arg args[] =
691         {
692             { sizeof(cl_uint), &num },
693             { sizeof(cl_uint), &global_work_size },
694             { sizeof(cl_mem), &mem1 },
695             { sizeof(cl_mem), &mem2 },
696             { sizeof(cl_mem), &mem3 },
697             { sizeof(cl_mem), &mem4 },
698         };
699 
700         log_info("Running '%s' kernel (%d of %d) ...\n",  sources_ndrange_Xd[i].src.kernel_name, i + 1, num_kernels_ndrange_Xd);
701         err_ret = run_single_kernel_args(context, queue, sources_ndrange_Xd[i].src.lines, sources_ndrange_Xd[i].src.num_lines, sources_ndrange_Xd[i].src.kernel_name, kernel_results, sizeof(kernel_results), arr_size(args), args);
702 
703         cl_int *ptr = (cl_int *)clEnqueueMapBuffer(queue, mem3, CL_TRUE, CL_MAP_READ, 0, glob_results.size() * sizeof(cl_int), 0, 0, 0, &err_ret);
704         test_error(err_ret, "clEnqueueMapBuffer() failed");
705 
706         if(check_error(err_ret, "'%s' kernel execution failed", sources_ndrange_Xd[i].src.kernel_name)) { ++failCnt; res = -1; }
707         else if((k = check_kernel_results(kernel_results, arr_size(kernel_results))) >= 0 && check_error(-1, "'%s' kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, kernel_results[k])) res = -1;
708         else if((k = check_kernel_results(ptr, global_work_size, glob_size_arr, loc_size_arr, ofs_arr, sources_ndrange_Xd[i].dim, sources_ndrange_Xd[i].localSize, sources_ndrange_Xd[i].offset)) >= 0 && check_error(-1, "'%s' global kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, glob_results[k])) res = -1;
709         else log_info("'%s' kernel is OK.\n", sources_ndrange_Xd[i].src.kernel_name);
710 
711         err_ret = clEnqueueUnmapMemObject(queue, mem3, ptr, 0, 0, 0);
712         test_error(err_ret, "clEnqueueUnmapMemObject() failed");
713 
714     }
715 
716     if (failCnt > 0)
717     {
718         log_error("ERROR: %d of %d kernels failed.\n", failCnt, num_kernels_ndrange_Xd);
719     }
720 
721     return res;
722 }
723 
724 
725 #endif
726 
727