1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <stdio.h>
17 #include <string.h>
18 #include "harness/testHarness.h"
19 #include "harness/typeWrappers.h"
20 
21 #include <vector>
22 
23 #include "procs.h"
24 #include "utils.h"
25 #include <time.h>
26 
27 
28 #ifdef CL_VERSION_2_0
29 extern int gWimpyMode;
30 static int nestingLevel = 3;
31 
32 static const char* enqueue_1D_wg_size_single[] =
33 {
34     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
35     NL, "{"
36     NL, "  size_t tidX = get_global_id(0);"
37     NL, "  queue_t def_q = get_default_queue();"
38     NL, "  if(--level < 0) return;"
39     NL, ""
40     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
41     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
42     NL, ""
43     NL, "  const size_t gs = 64 * 64 * 64;"
44     NL, "  size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
45     NL, "  ls = ls? ls: 1;"
46     NL, ""
47     NL, "  ndrange_t ndrange = ndrange_1D(gs, ls);"
48     NL, ""
49     NL, "  // Only 1 work-item enqueues block"
50     NL, "  if(tidX == 0)"
51     NL, "  {"
52     NL, "    atomic_inc(&res[tidX % maxGlobalWorkSize]);"
53     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
54     NL, "    if(enq_res != CLK_SUCCESS) { res[tidX % maxGlobalWorkSize] = -1; return; }"
55     NL, "  }"
56     NL, "}"
57     NL, ""
58     NL, "kernel void enqueue_1D_wg_size_single(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
59     NL, "{"
60     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
61     NL, "}"
62     NL
63 };
64 
check_single(cl_int * results,cl_int len,cl_int nesting_level)65 static int check_single(cl_int* results, cl_int len, cl_int nesting_level)
66 {
67     for(size_t i = 0; i < len; ++i)
68     {
69         if(i == 0 && results[i] != nestingLevel)
70         {
71             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], nestingLevel, i);
72             return (int)i;
73         }
74 
75         if(i > 0 && results[i] != 0)
76         {
77             log_error("ERROR: Kernel returned %d vs. expected 0, index: %d\n", results[i], i);
78             return (int)i;
79         }
80     }
81 
82     return -1;
83 }
84 
85 static const char* enqueue_1D_wg_size_some_eq[] =
86 {
87     NL, "void block_fn(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
88     NL, "{"
89     NL, "  size_t tidX = get_global_id(0);"
90     NL, "  queue_t def_q = get_default_queue();"
91     NL, "  if(--level < 0) return;"
92     NL, ""
93     NL, "  void (^kernelBlock)(void) = ^{ block_fn(res, level, maxGlobalWorkSize, rnd); };"
94     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
95     NL, ""
96     NL, "  const size_t gs = 8 * 8 * 2;"
97     NL, "  size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
98     NL, "  ls = ls? ls: 1;"
99     NL, ""
100     NL, "  ndrange_t ndrange = ndrange_1D(gs, ls);"
101     NL, ""
102     NL, "  // Some work-items enqueues nested blocks with the same level"
103     NL, "  if((tidX % (maxGlobalWorkSize / 8)) == 0)"
104     NL, "  {"
105     NL, "    atomic_inc(&res[tidX % maxGlobalWorkSize]);"
106     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
107     NL, "    if(enq_res != CLK_SUCCESS) { res[tidX % maxGlobalWorkSize] = -1; return; }"
108     NL, "  }"
109     NL, "}"
110     NL, ""
111     NL, "kernel void enqueue_1D_wg_size_some_eq(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
112     NL, "{"
113     NL, "  block_fn(res, level, maxGlobalWorkSize, rnd);"
114     NL, "}"
115     NL
116 };
117 
generate_reference_results_some_eq_1D(std::vector<cl_int> & referenceResults,cl_int maxGlobalWorkSize,cl_int level)118 void generate_reference_results_some_eq_1D(std::vector<cl_int> &referenceResults, cl_int maxGlobalWorkSize, cl_int level)
119 {
120     size_t globalSize = (level == nestingLevel) ? maxGlobalWorkSize: (8 * 8 * 2);
121     if(--level < 0)
122     {
123         return;
124     }
125 
126     for (size_t tidX = 0; tidX < globalSize; ++tidX)
127     {
128         if ((tidX % (maxGlobalWorkSize / 8)) == 0)
129         {
130             ++referenceResults[tidX % maxGlobalWorkSize];
131             generate_reference_results_some_eq_1D(referenceResults, maxGlobalWorkSize, level);
132         }
133     }
134 }
135 
check_some_eq_1D(cl_int * results,cl_int len,cl_int nesting_level)136 static int check_some_eq_1D(cl_int* results, cl_int len, cl_int nesting_level)
137 {
138     std::vector<cl_int> referenceResults(len, 0);
139     generate_reference_results_some_eq_1D(referenceResults, len, nesting_level);
140 
141     for(size_t i = 0; i < len; ++i)
142     {
143         if (results[i] != referenceResults[i])
144         {
145             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
146             return (int)i;
147         }
148     }
149 
150     return -1;
151 }
152 
153 static const char* enqueue_1D_wg_size_some_diff[] =
154 {
155     NL, "void block_fn(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
156     NL, "{"
157     NL, "  size_t tidX = get_global_id(0);"
158     NL, "  queue_t def_q = get_default_queue();"
159     NL, "  if(--level < 0) return;"
160     NL, ""
161     NL, "  void (^kernelBlock)(void) = ^{ block_fn(res, level, maxGlobalWorkSize, rnd); };"
162     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
163     NL, ""
164     NL, "  const size_t gs = 8 * 8 * 8;"
165     NL, "  size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
166     NL, "  ls = ls? ls: 1;"
167     NL, ""
168     NL, "  ndrange_t ndrange = ndrange_1D(gs, ls);"
169     NL, ""
170     NL, "  // Some work-items enqueues nested blocks with different levels"
171     NL, "  if((tidX % 2) == 0)"
172     NL, "  {"
173     NL, "    atomic_inc(&res[tidX % maxGlobalWorkSize]);"
174     NL, "    if(level >= tidX)"
175     NL, "    {"
176     NL, "      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
177     NL, "      if(enq_res != CLK_SUCCESS) { res[tidX % maxGlobalWorkSize] = -1; return; }"
178     NL, "    }"
179     NL, "  }"
180     NL, "}"
181     NL, ""
182     NL, "kernel void enqueue_1D_wg_size_some_diff(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
183     NL, "{"
184     NL, "  block_fn(res, level, maxGlobalWorkSize, rnd);"
185     NL, "}"
186     NL
187 };
188 
generate_reference_results_some_diff_1D(std::vector<cl_int> & referenceResults,cl_int maxGlobalWorkSize,cl_int level)189 void generate_reference_results_some_diff_1D(std::vector<cl_int> &referenceResults, cl_int maxGlobalWorkSize, cl_int level)
190 {
191     size_t globalSize = (level == nestingLevel) ? maxGlobalWorkSize: (8 * 8 * 8);
192     if(--level < 0)
193     {
194         return;
195     }
196 
197     for (size_t tidX = 0; tidX < globalSize; ++tidX)
198     {
199         if ((tidX % 2) == 0)
200         {
201             ++referenceResults[tidX % maxGlobalWorkSize];
202             if (level >= tidX)
203             {
204                 generate_reference_results_some_diff_1D(referenceResults, maxGlobalWorkSize, level);
205             }
206         }
207     }
208 }
209 
check_some_diff_1D(cl_int * results,cl_int maxGlobalWorkSize,cl_int nesting_level)210 static int check_some_diff_1D(cl_int* results, cl_int maxGlobalWorkSize, cl_int nesting_level)
211 {
212     std::vector<cl_int> referenceResults(maxGlobalWorkSize, 0);
213     generate_reference_results_some_diff_1D(referenceResults, maxGlobalWorkSize, nesting_level);
214 
215     for(size_t i = 0; i < maxGlobalWorkSize; ++i)
216     {
217         if (results[i] != referenceResults[i])
218         {
219             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
220             return (int)i;
221         }
222     }
223 
224     return -1;
225 }
226 
227 static const char* enqueue_1D_wg_size_all_eq[] =
228 {
229     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
230     NL, "{"
231     NL, "  size_t tidX = get_global_id(0);"
232     NL, "  queue_t def_q = get_default_queue();"
233     NL, "  if(--level < 0) return;"
234     NL, ""
235     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
236     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
237     NL, ""
238     NL, "  const size_t gs = 8;"
239     NL, "  size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
240     NL, "  ls = ls? ls: 1;"
241     NL, ""
242     NL, "  ndrange_t ndrange = ndrange_1D(gs, ls);"
243     NL, ""
244     NL, "  // All work-items enqueues nested blocks with the same level"
245     NL, "  atomic_inc(&res[tidX % maxGlobalWorkSize]);"
246     NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
247     NL, "  if(enq_res != CLK_SUCCESS) { res[tidX % maxGlobalWorkSize] = -1; return; }"
248     NL, "}"
249     NL, ""
250     NL, "kernel void enqueue_1D_wg_size_all_eq(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
251     NL, "{"
252     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
253     NL, "}"
254     NL
255 };
256 
generate_reference_results_all_eq_1D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)257 void generate_reference_results_all_eq_1D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
258 {
259     size_t globalSize = (level == nestingLevel) ? len: 8;
260     if(--level < 0)
261     {
262         return;
263     }
264 
265     for (size_t tidX = 0; tidX < globalSize; ++tidX)
266     {
267         ++referenceResults[tidX % len];
268         generate_reference_results_all_eq_1D(referenceResults, len, level);
269     }
270 }
271 
check_all_eq_1D(cl_int * results,cl_int len,cl_int nesting_level)272 static int check_all_eq_1D(cl_int* results, cl_int len, cl_int nesting_level)
273 {
274     std::vector<cl_int> referenceResults(len, 0);
275     generate_reference_results_all_eq_1D(referenceResults, len, nesting_level);
276 
277     for(size_t i = 0; i < len; ++i)
278     {
279         if (results[i] != referenceResults[i])
280         {
281             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
282             return (int)i;
283         }
284     }
285 
286     return -1;
287 }
288 
289 static const char* enqueue_1D_wg_size_all_diff[] =
290 {
291     NL, "void block_fn(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
292     NL, "{"
293     NL, "  size_t tidX = get_global_id(0);"
294     NL, "  queue_t def_q = get_default_queue();"
295     NL, "  if((--level) < 0) return;"
296     NL, ""
297     NL, "  void (^kernelBlock)(void) = ^{ block_fn(res, level, maxGlobalWorkSize, rnd); };"
298     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
299     NL, ""
300     NL, "  const size_t gs = 8 * 8 * 8;"
301     NL, "  size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
302     NL, "  ls = ls? ls: 1;"
303     NL, ""
304     NL, "  ndrange_t ndrange = ndrange_1D(gs, ls);"
305     NL, ""
306     NL, "  // All work-items enqueues nested blocks with different levels"
307     NL, "  atomic_inc(&res[tidX % maxGlobalWorkSize]);"
308     NL, "  if(level >= tidX)"
309     NL, "  {"
310     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
311     NL, "    if(enq_res != CLK_SUCCESS) { res[tidX % maxGlobalWorkSize] = -1; return; }"
312     NL, "  }"
313     NL, "}"
314     NL, ""
315     NL, "kernel void enqueue_1D_wg_size_all_diff(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
316     NL, "{"
317     NL, "  block_fn(res, level, maxGlobalWorkSize, rnd);"
318     NL, "}"
319     NL
320 };
321 
generate_reference_results_all_diff_1D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)322 void generate_reference_results_all_diff_1D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
323 {
324     size_t globalSize = (level == nestingLevel) ? len: (8 * 8 * 8);
325     if((--level) < 0)
326     {
327         return;
328     }
329 
330     for (size_t threadIdx = 0; threadIdx < globalSize; ++threadIdx)
331     {
332         ++referenceResults[threadIdx % len];
333         if (level >= threadIdx)
334         {
335             generate_reference_results_all_diff_1D(referenceResults, len, level);
336         }
337     }
338 }
339 
check_all_diff_1D(cl_int * results,cl_int len,cl_int nesting_level)340 static int check_all_diff_1D(cl_int* results, cl_int len, cl_int nesting_level)
341 {
342     std::vector<cl_int> referenceResults(len, 0);
343     generate_reference_results_all_diff_1D(referenceResults, len, nesting_level);
344 
345     for(size_t i = 0; i < len; ++i)
346     {
347         if (results[i] != referenceResults[i])
348         {
349             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
350             return (int)i;
351         }
352     }
353 
354     return -1;
355 }
356 
357 static const char* enqueue_2D_wg_size_single[] =
358 {
359     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
360     NL, "{"
361     NL, "  size_t tidX = get_global_id(0);"
362     NL, "  size_t tidY = get_global_id(1);"
363     NL, "  size_t linearId = get_global_linear_id();"
364     NL, "  queue_t def_q = get_default_queue();"
365     NL, "  if(--level < 0) return;"
366     NL, ""
367     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
368     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
369     NL, ""
370     NL, "  const size_t gs[] = { 64, 64 * 64 };"
371     NL, "  size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
372     NL, "  ls[1] = ls[1]? ls[1]: 1;"
373     NL, "  "
374     NL, "  ndrange_t ndrange = ndrange_2D(gs, ls);"
375     NL, ""
376     NL, "  // Only 1 work-item enqueues block"
377     NL, "  if(tidX == 0 && tidY == 0)"
378     NL, "  {"
379     NL, "    atomic_inc(&res[linearId % maxGlobalWorkSize]);"
380     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
381     NL, "    if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
382     NL, "  }"
383     NL, "}"
384     NL, ""
385     NL, "kernel void enqueue_2D_wg_size_single(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
386     NL, "{"
387     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
388     NL, "}"
389     NL
390 };
391 
392 static const char* enqueue_2D_wg_size_some_eq[] =
393 {
394     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
395     NL, "{"
396     NL, "  size_t tidX = get_global_id(0);"
397     NL, "  size_t tidY = get_global_id(1);"
398     NL, "  size_t linearId = get_global_linear_id();"
399     NL, "  queue_t def_q = get_default_queue();"
400     NL, "  if(--level < 0) return;"
401     NL, ""
402     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
403     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
404     NL, ""
405     NL, "  const size_t gs[] = { 4, 4 };"
406     NL, "  size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
407     NL, "  ls[1] = ls[1]? ls[1]: 1;"
408     NL, "  "
409     NL, "  ndrange_t ndrange = ndrange_2D(gs, ls);"
410     NL, ""
411     NL, "  // Some work-items enqueues nested blocks with the same level"
412     NL, "  if((tidX < (get_global_size(0) >> 1)) && ((tidY < (get_global_size(1) >> 1)) || get_global_size(1) == 1))"
413     NL, "  {"
414     NL, "    atomic_inc(&res[linearId % maxGlobalWorkSize]);"
415     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
416     NL, "    if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
417     NL, "  }"
418     NL, "}"
419     NL, ""
420     NL, "kernel void enqueue_2D_wg_size_some_eq(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
421     NL, "{"
422     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
423     NL, "}"
424     NL
425 };
426 
generate_reference_results_some_eq_2D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)427 void generate_reference_results_some_eq_2D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
428 {
429     size_t globalSizeX = (level == nestingLevel) ? len: 4;
430     size_t globalSizeY = (level == nestingLevel) ? 1: 4;
431     if(--level < 0)
432     {
433         return;
434     }
435 
436     for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
437     {
438         for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
439         {
440             if ((tidX < (globalSizeX >> 1)) && ((tidY < (globalSizeY >> 1)) || globalSizeY == 1))
441             {
442                 ++referenceResults[(globalSizeX * tidY + tidX) % len];
443                 generate_reference_results_some_eq_2D(referenceResults, len, level);
444             }
445         }
446     }
447 }
448 
check_some_eq_2D(cl_int * results,cl_int len,cl_int nesting_level)449 static int check_some_eq_2D(cl_int* results, cl_int len, cl_int nesting_level)
450 {
451     std::vector<cl_int> referenceResults(len, 0);
452     generate_reference_results_some_eq_2D(referenceResults, len, nesting_level);
453 
454     for(size_t i = 0; i < len; ++i)
455     {
456         if (results[i] != referenceResults[i])
457         {
458             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
459             return (int)i;
460         }
461     }
462 
463     return -1;
464 }
465 
466 static const char* enqueue_2D_wg_size_some_diff[] =
467 {
468     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
469     NL, "{"
470     NL, "  size_t tidX = get_global_id(0);"
471     NL, "  size_t tidY = get_global_id(1);"
472     NL, "  size_t linearId = get_global_linear_id();"
473     NL, "  queue_t def_q = get_default_queue();"
474     NL, "  if(--level < 0) return;"
475     NL, ""
476     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
477     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
478     NL, ""
479     NL, "  const size_t gs[] = { 8, 8 };"
480     NL, "  size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
481     NL, "  ls[1] = ls[1]? ls[1]: 1;"
482     NL, "  "
483     NL, "  ndrange_t ndrange = ndrange_2D(gs, ls);"
484     NL, ""
485     NL, "  // Some work-items enqueues nested blocks with different levels"
486     NL, "  if((tidX % 2) == 0 && (tidY % 2) == 0)"
487     NL, "  {"
488     NL, "    atomic_inc(&res[linearId % maxGlobalWorkSize]);"
489     NL, "    if(level >= tidX && level >= tidY)"
490     NL, "    {"
491     NL, "      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
492     NL, "      if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
493     NL, "    }"
494     NL, "  }"
495     NL, "}"
496     NL, ""
497     NL, "kernel void enqueue_2D_wg_size_some_diff(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
498     NL, "{"
499     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
500     NL, "}"
501     NL
502 };
503 
generate_reference_results_some_diff_2D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)504 void generate_reference_results_some_diff_2D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
505 {
506     size_t globalSizeX = (level == nestingLevel) ? len: 8;
507     size_t globalSizeY = (level == nestingLevel) ? 1: 8;
508     if(--level < 0)
509     {
510         return;
511     }
512 
513     for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
514     {
515         for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
516         {
517             if ((tidX % 2) == 0 && (tidY % 2) == 0)
518             {
519                 ++referenceResults[(globalSizeX * tidY + tidX) % len];
520                 if (level >= tidX && level >= tidY)
521                 {
522                     generate_reference_results_some_diff_2D(referenceResults, len, level);
523                 }
524             }
525         }
526     }
527 }
528 
check_some_diff_2D(cl_int * results,cl_int len,cl_int nesting_level)529 static int check_some_diff_2D(cl_int* results, cl_int len, cl_int nesting_level)
530 {
531     std::vector<cl_int> referenceResults(len, 0);
532     generate_reference_results_some_diff_2D(referenceResults, len, nesting_level);
533 
534     for(size_t i = 0; i < len; ++i)
535     {
536         if (results[i] != referenceResults[i])
537         {
538             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
539             return (int)i;
540         }
541     }
542 
543     return -1;
544 }
545 
546 static const char* enqueue_2D_wg_size_all_eq[] =
547 {
548     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
549     NL, "{"
550     NL, "  size_t tidX = get_global_id(0);"
551     NL, "  size_t tidY = get_global_id(1);"
552     NL, "  size_t linearId = get_global_linear_id();"
553     NL, "  queue_t def_q = get_default_queue();"
554     NL, "  if(--level < 0) return;"
555     NL, ""
556     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
557     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
558     NL, ""
559     NL, "  const size_t gs[] = { 2, 2 };"
560     NL, "  size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
561     NL, "  ls[1] = ls[1]? ls[1]: 1;"
562     NL, "  "
563     NL, "  ndrange_t ndrange = ndrange_2D(gs, ls);"
564     NL, ""
565     NL, "  // All work-items enqueues nested blocks with the same level"
566     NL, "  atomic_inc(&res[linearId % maxGlobalWorkSize]);"
567     NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
568     NL, "  if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
569     NL, "}"
570     NL, ""
571     NL, "kernel void enqueue_2D_wg_size_all_eq(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
572     NL, "{"
573     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
574     NL, "}"
575     NL
576 };
577 
generate_reference_results_all_eq_2D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)578 void generate_reference_results_all_eq_2D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
579 {
580     size_t globalSizeX = (level == nestingLevel) ? len: 2;
581     size_t globalSizeY = (level == nestingLevel) ? 1: 2;
582     if(--level < 0)
583     {
584         return;
585     }
586 
587     for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
588     {
589         for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
590         {
591             ++referenceResults[(globalSizeX * tidY + tidX) % len];
592             generate_reference_results_all_eq_2D(referenceResults, len, level);
593         }
594     }
595 }
596 
check_all_eq_2D(cl_int * results,cl_int len,cl_int nesting_level)597 static int check_all_eq_2D(cl_int* results, cl_int len, cl_int nesting_level)
598 {
599     std::vector<cl_int> referenceResults(len, 0);
600     generate_reference_results_all_eq_2D(referenceResults, len, nesting_level);
601 
602     for(size_t i = 0; i < len; ++i)
603     {
604         if (results[i] != referenceResults[i])
605         {
606             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
607             return (int)i;
608         }
609     }
610 
611     return -1;
612 }
613 
614 static const char* enqueue_2D_wg_size_all_diff[] =
615 {
616     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
617     NL, "{"
618     NL, "  size_t tidX = get_global_id(0);"
619     NL, "  size_t tidY = get_global_id(1);"
620     NL, "  size_t linearId = get_global_linear_id();"
621     NL, "  queue_t def_q = get_default_queue();"
622     NL, "  if(--level < 0) return;"
623     NL, ""
624     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
625     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
626     NL, ""
627     NL, "  size_t gs[] = { 8, 8 * 8 };"
628     NL, "  size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
629     NL, "  ls[1] = ls[1]? ls[1]: 1;"
630     NL, "  "
631     NL, "  ndrange_t ndrange = ndrange_2D(gs, ls);"
632     NL, ""
633     NL, "  // All work-items enqueues nested blocks with different levels"
634     NL, "  atomic_inc(&res[linearId % maxGlobalWorkSize]);"
635     NL, "  if(level >= tidX && level >= tidY)"
636     NL, "  {"
637     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
638     NL, "    if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
639     NL, "  }"
640     NL, "}"
641     NL, ""
642     NL, "kernel void enqueue_2D_wg_size_all_diff(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
643     NL, "{"
644     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
645     NL, "}"
646     NL
647 };
648 
generate_reference_results_all_diff_2D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)649 void generate_reference_results_all_diff_2D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
650 {
651     size_t globalSizeX = (level == nestingLevel) ? len: 8;
652     size_t globalSizeY = (level == nestingLevel) ? 1: (8 * 8);
653     if(--level < 0)
654     {
655         return;
656     }
657 
658     for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
659     {
660         for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
661         {
662             ++referenceResults[(globalSizeX * tidY + tidX) % len];
663             if (level >= tidX && level >= tidY)
664             {
665                 generate_reference_results_all_diff_2D(referenceResults, len, level);
666             }
667         }
668     }
669 }
670 
check_all_diff_2D(cl_int * results,cl_int len,cl_int nesting_level)671 static int check_all_diff_2D(cl_int* results, cl_int len, cl_int nesting_level)
672 {
673     std::vector<cl_int> referenceResults(len, 0);
674     generate_reference_results_all_diff_2D(referenceResults, len, nesting_level);
675 
676     for(size_t i = 0; i < len; ++i)
677     {
678         if (results[i] != referenceResults[i])
679         {
680             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
681             return (int)i;
682         }
683     }
684 
685     return -1;
686 }
687 
688 static const char* enqueue_3D_wg_size_single[] =
689 {
690     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
691     NL, "{"
692     NL, "  size_t tidX = get_global_id(0);"
693     NL, "  size_t tidY = get_global_id(1);"
694     NL, "  size_t tidZ = get_global_id(2);"
695     NL, "  size_t linearId = get_global_linear_id();"
696     NL, "  queue_t def_q = get_default_queue();"
697     NL, "  if(--level < 0) return;"
698     NL, ""
699     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
700     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
701     NL, ""
702     NL, "  const size_t gs[] = { 64, 64, 64 };"
703     NL, "  size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
704     NL, "  ls[2] = ls[2]? ls[2]: 1;"
705     NL, "  "
706     NL, "  ndrange_t ndrange = ndrange_3D(gs, ls);"
707     NL, ""
708     NL, "  // Only 1 work-item enqueues block"
709     NL, "  if(tidX == 0 && tidY == 0 && tidZ == 0)"
710     NL, "  {"
711     NL, "    atomic_inc(&res[linearId % maxGlobalWorkSize]);"
712     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
713     NL, "    if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
714     NL, "  }"
715     NL, "}"
716     NL, ""
717     NL, "kernel void enqueue_3D_wg_size_single(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
718     NL, "{"
719     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
720     NL, "}"
721     NL
722 };
723 
724 static const char* enqueue_3D_wg_size_some_eq[] =
725 {
726     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
727     NL, "{"
728     NL, "  size_t tidX = get_global_id(0);"
729     NL, "  size_t tidY = get_global_id(1);"
730     NL, "  size_t tidZ = get_global_id(2);"
731     NL, "  size_t linearId = get_global_linear_id();"
732     NL, "  queue_t def_q = get_default_queue();"
733     NL, "  if(--level < 0) return;"
734     NL, ""
735     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
736     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
737     NL, ""
738     NL, "  const size_t gs[] = { 4, 4, 4 };"
739     NL, "  size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
740     NL, "  ls[2] = ls[2]? ls[2]: 1;"
741     NL, "  "
742     NL, "  ndrange_t ndrange = ndrange_3D(gs, ls);"
743     NL, ""
744     NL, "  // Some work-items enqueues nested blocks with the same level"
745     NL, "  if((tidX < (get_global_size(0) >> 1)) && "
746     NL, "    ((tidY < (get_global_size(1) >> 1)) || get_global_size(1) == 1) &&"
747     NL, "    ((tidZ < (get_global_size(2) >> 1)) || get_global_size(2) == 1))"
748     NL, "  {"
749     NL, "    atomic_inc(&res[linearId % maxGlobalWorkSize]);"
750     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
751     NL, "    if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
752     NL, "  }"
753     NL, "}"
754     NL, ""
755     NL, "kernel void enqueue_3D_wg_size_some_eq(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
756     NL, "{"
757     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
758     NL, "}"
759     NL
760 };
761 
generate_reference_results_some_eq_3D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)762 void generate_reference_results_some_eq_3D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
763 {
764     size_t globalSizeX = (level == nestingLevel) ? len: 4;
765     size_t globalSizeY = (level == nestingLevel) ? 1: 4;
766     size_t globalSizeZ = (level == nestingLevel) ? 1: 4;
767     if(--level < 0)
768     {
769         return;
770     }
771 
772     for (size_t tidZ = 0; tidZ < globalSizeZ; ++tidZ)
773     {
774         for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
775         {
776             for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
777             {
778                 if ((tidX < (globalSizeX >> 1)) && ((tidY < (globalSizeY >> 1)) || globalSizeY == 1) && ((tidZ < (globalSizeZ >> 1)) || globalSizeZ == 1))
779                 {
780                     ++referenceResults[(globalSizeX * globalSizeY * tidZ + globalSizeX * tidY + tidX) % len];
781                     generate_reference_results_some_eq_3D(referenceResults, len, level);
782                 }
783             }
784         }
785     }
786 }
787 
check_some_eq_3D(cl_int * results,cl_int len,cl_int nesting_level)788 static int check_some_eq_3D(cl_int* results, cl_int len, cl_int nesting_level)
789 {
790     std::vector<cl_int> referenceResults(len, 0);
791     generate_reference_results_some_eq_3D(referenceResults, len, nesting_level);
792 
793     for(size_t i = 0; i < len; ++i)
794     {
795         if (results[i] != referenceResults[i])
796         {
797             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
798             return (int)i;
799         }
800     }
801 
802     return -1;
803 }
804 
805 static const char* enqueue_3D_wg_size_some_diff[] =
806 {
807     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
808     NL, "{"
809     NL, "  size_t tidX = get_global_id(0);"
810     NL, "  size_t tidY = get_global_id(1);"
811     NL, "  size_t tidZ = get_global_id(2);"
812     NL, "  size_t linearId = get_global_linear_id();"
813     NL, "  queue_t def_q = get_default_queue();"
814     NL, "  if(--level < 0) return;"
815     NL, ""
816     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
817     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
818     NL, ""
819     NL, "  const size_t gs[] = { 8, 8, 8 };"
820     NL, "  size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
821     NL, "  ls[2] = ls[2]? ls[2]: 1;"
822     NL, "  "
823     NL, "  ndrange_t ndrange = ndrange_3D(gs, ls);"
824     NL, ""
825     NL, "  // Some work-items enqueues nested blocks with different levels"
826     NL, "  if((tidX % 2) == 0 && (tidY % 2) == 0 && (tidZ % 2) == 0)"
827     NL, "  {"
828     NL, "    atomic_inc(&res[linearId % maxGlobalWorkSize]);"
829     NL, "    if(level >= tidX && level >= tidY && level >= tidZ)"
830     NL, "    {"
831     NL, "      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
832     NL, "      if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
833     NL, "    }"
834     NL, "  }"
835     NL, "}"
836     NL, ""
837     NL, "kernel void enqueue_3D_wg_size_some_diff(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
838     NL, "{"
839     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
840     NL, "}"
841     NL
842 };
843 
generate_reference_results_some_diff_3D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)844 void generate_reference_results_some_diff_3D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
845 {
846     size_t globalSizeX = (level == nestingLevel) ? len: 8;
847     size_t globalSizeY = (level == nestingLevel) ? 1: 8;
848     size_t globalSizeZ = (level == nestingLevel) ? 1: 8;
849     if(--level < 0)
850     {
851         return;
852     }
853 
854     for (size_t tidZ = 0; tidZ < globalSizeZ; ++tidZ)
855     {
856         for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
857         {
858             for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
859             {
860                 if ((tidX % 2) == 0 && (tidY % 2) == 0 && (tidZ % 2) == 0)
861                 {
862                     ++referenceResults[(globalSizeX * globalSizeY * tidZ + globalSizeX * tidY + tidX) % len];
863                     if (level >= tidX && level >= tidY && level >= tidZ)
864                     {
865                         generate_reference_results_some_diff_3D(referenceResults, len, level);
866                     }
867                 }
868             }
869         }
870     }
871 }
872 
check_some_diff_3D(cl_int * results,cl_int len,cl_int nesting_level)873 static int check_some_diff_3D(cl_int* results, cl_int len, cl_int nesting_level)
874 {
875     std::vector<cl_int> referenceResults(len, 0);
876     generate_reference_results_some_diff_3D(referenceResults, len, nesting_level);
877 
878     for(size_t i = 0; i < len; ++i)
879     {
880         if (results[i] != referenceResults[i])
881         {
882             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
883             return (int)i;
884         }
885     }
886 
887     return -1;
888 }
889 
890 static const char* enqueue_3D_wg_size_all_eq[] =
891 {
892     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
893     NL, "{"
894     NL, "  size_t tidX = get_global_id(0);"
895     NL, "  size_t tidY = get_global_id(1);"
896     NL, "  size_t tidZ = get_global_id(2);"
897     NL, "  size_t linearId = get_global_linear_id();"
898     NL, "  queue_t def_q = get_default_queue();"
899     NL, "  if(--level < 0) return;"
900     NL, ""
901     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
902     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
903     NL, ""
904     NL, "  const size_t gs[] = { 2, 2, 2 };"
905     NL, "  size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
906     NL, "  ls[2] = ls[2]? ls[2]: 1;"
907     NL, "  "
908     NL, "  ndrange_t ndrange = ndrange_3D(gs, ls);"
909     NL, ""
910     NL, "  // All work-items enqueues nested blocks with the same level"
911     NL, "  atomic_inc(&res[linearId % maxGlobalWorkSize]);"
912     NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
913     NL, "  if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
914     NL, "}"
915     NL, ""
916     NL, "kernel void enqueue_3D_wg_size_all_eq(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
917     NL, "{"
918     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
919     NL, "}"
920     NL
921 };
922 
generate_reference_results_all_eq_3D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)923 void generate_reference_results_all_eq_3D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
924 {
925     size_t globalSizeX = (level == nestingLevel) ? len: 2;
926     size_t globalSizeY = (level == nestingLevel) ? 1: 2;
927     size_t globalSizeZ = (level == nestingLevel) ? 1: 2;
928     if(--level < 0)
929     {
930         return;
931     }
932 
933     for (size_t tidZ = 0; tidZ < globalSizeZ; ++tidZ)
934     {
935         for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
936         {
937             for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
938             {
939                 ++referenceResults[(globalSizeX * globalSizeY * tidZ + globalSizeX * tidY + tidX) % len];
940                 generate_reference_results_all_eq_3D(referenceResults, len, level);
941             }
942         }
943     }
944 }
945 
check_all_eq_3D(cl_int * results,cl_int len,cl_int nesting_level)946 static int check_all_eq_3D(cl_int* results, cl_int len, cl_int nesting_level)
947 {
948     std::vector<cl_int> referenceResults(len, 0);
949     generate_reference_results_all_eq_3D(referenceResults, len, nesting_level);
950 
951     for(size_t i = 0; i < len; ++i)
952     {
953         if (results[i] != referenceResults[i])
954         {
955             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
956             return (int)i;
957         }
958     }
959 
960     return -1;
961 }
962 
963 static const char* enqueue_3D_wg_size_all_diff[] =
964 {
965     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
966     NL, "{"
967     NL, "  size_t tidX = get_global_id(0);"
968     NL, "  size_t tidY = get_global_id(1);"
969     NL, "  size_t tidZ = get_global_id(2);"
970     NL, "  size_t linearId = get_global_linear_id();"
971     NL, "  queue_t def_q = get_default_queue();"
972     NL, "  if(--level < 0) return;"
973     NL, ""
974     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
975     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
976     NL, ""
977     NL, "  const size_t gs[] = { 8, 8, 8 };"
978     NL, "  size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
979     NL, "  ls[2] = ls[2]? ls[2]: 1;"
980     NL, "  "
981     NL, "  ndrange_t ndrange = ndrange_3D(gs, ls);"
982     NL, ""
983     NL, "  // All work-items enqueues nested blocks with different levels"
984     NL, "  atomic_inc(&res[linearId % maxGlobalWorkSize]);"
985     NL, "  if(level >= tidX && level >= tidY && level >= tidZ)"
986     NL, "  {"
987     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
988     NL, "    if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
989     NL, "  }"
990     NL, "}"
991     NL, ""
992     NL, "kernel void enqueue_3D_wg_size_all_diff(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
993     NL, "{"
994     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
995     NL, "}"
996     NL
997 };
998 
generate_reference_results_all_diff_3D(std::vector<cl_int> & referenceResults,cl_int len,cl_int level)999 void generate_reference_results_all_diff_3D(std::vector<cl_int> &referenceResults, cl_int len, cl_int level)
1000 {
1001     size_t globalSizeX = (level == nestingLevel) ? len: 8;
1002     size_t globalSizeY = (level == nestingLevel) ? 1: 8;
1003     size_t globalSizeZ = (level == nestingLevel) ? 1: 8;
1004     if(--level < 0)
1005     {
1006         return;
1007     }
1008 
1009     for (size_t tidZ = 0; tidZ < globalSizeZ; ++tidZ)
1010     {
1011         for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
1012         {
1013             for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
1014             {
1015                 ++referenceResults[(globalSizeX * globalSizeY * tidZ + globalSizeX * tidY + tidX) % len];
1016                 if (level >= tidX && level >= tidY && level >= tidZ)
1017                 {
1018                     generate_reference_results_all_diff_3D(referenceResults, len, level);
1019                 }
1020             }
1021         }
1022     }
1023 }
1024 
check_all_diff_3D(cl_int * results,cl_int len,cl_int nesting_level)1025 static int check_all_diff_3D(cl_int* results, cl_int len, cl_int nesting_level)
1026 {
1027     std::vector<cl_int> referenceResults(len, 0);
1028     generate_reference_results_all_diff_3D(referenceResults, len, nesting_level);
1029 
1030     for(size_t i = 0; i < len; ++i)
1031     {
1032         if (results[i] != referenceResults[i])
1033         {
1034             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
1035             return (int)i;
1036         }
1037     }
1038 
1039     return -1;
1040 }
1041 
1042 static const char* enqueue_mix_wg_size_single[] =
1043 {
1044     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
1045     NL, "{"
1046     NL, "  size_t tidX = get_global_id(0);"
1047     NL, "  size_t tidY = get_global_id(1);"
1048     NL, "  size_t tidZ = get_global_id(2);"
1049     NL, "  size_t linearId = get_global_linear_id();"
1050     NL, "  queue_t def_q = get_default_queue();"
1051     NL, "  if(--level < 0) return;"
1052     NL, ""
1053     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
1054     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
1055     NL, ""
1056     NL, "  ndrange_t ndrange;"
1057     NL, "  switch((linearId + level) % 3)"
1058     NL, "  {"
1059     NL, "    case 0:"
1060     NL, "      {"
1061     NL, "        const size_t gs = 64 * 64 * 64;"
1062     NL, "        size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
1063     NL, "        ls = ls? ls: 1;"
1064     NL, "        ndrange = ndrange_1D(gs, ls);"
1065     NL, "      }"
1066     NL, "      break;"
1067     NL, "    case 1:"
1068     NL, "      {"
1069     NL, "        const size_t gs[] = { 64, 64 * 64 };"
1070     NL, "        size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
1071     NL, "        ls[1] = ls[1]? ls[1]: 1;"
1072     NL, "        ndrange = ndrange_2D(gs, ls);"
1073     NL, "      }"
1074     NL, "      break;"
1075     NL, "    case 2:"
1076     NL, "      {"
1077     NL, "        const size_t gs[] = { 64, 64, 64 };"
1078     NL, "        size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
1079     NL, "        ls[2] = ls[2]? ls[2]: 1;"
1080     NL, "        ndrange = ndrange_3D(gs, ls);"
1081     NL, "      }"
1082     NL, "      break;"
1083     NL, "    default:"
1084     NL, "      break;"
1085     NL, "  }"
1086     NL, ""
1087     NL, "  // Only 1 work-item enqueues block"
1088     NL, "  if(tidX == 0 && (tidY == 0 || get_global_size(1) == 1) && (tidZ == 0 || get_global_size(2) == 1))"
1089     NL, "  {"
1090     NL, "    atomic_inc(&res[linearId % maxGlobalWorkSize]);"
1091     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
1092     NL, "    if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
1093     NL, "  }"
1094     NL, "}"
1095     NL, ""
1096     NL, "kernel void enqueue_mix_wg_size_single(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
1097     NL, "{"
1098     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
1099     NL, "}"
1100     NL
1101 };
1102 
1103 static const char* enqueue_mix_wg_size_some_eq[] =
1104 {
1105     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
1106     NL, "{"
1107     NL, "  queue_t def_q = get_default_queue();"
1108     NL, "  size_t tidX = get_global_id(0);"
1109     NL, "  size_t tidY = get_global_id(1);"
1110     NL, "  size_t tidZ = get_global_id(2);"
1111     NL, "  size_t linearId = get_global_linear_id();"
1112     NL, "  if(--level < 0) return;"
1113     NL, ""
1114     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
1115     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
1116     NL, ""
1117     NL, "  ndrange_t ndrange;"
1118     NL, "  switch((linearId + level) % 3)"
1119     NL, "  {"
1120     NL, "    case 0:"
1121     NL, "      {"
1122     NL, "        const size_t gs = 2 * 4 * 4;"
1123     NL, "        size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
1124     NL, "        ls = ls? ls: 1;"
1125     NL, "        ndrange = ndrange_1D(gs, ls);"
1126     NL, "      }"
1127     NL, "      break;"
1128     NL, "    case 1:"
1129     NL, "      {"
1130     NL, "        const size_t gs[] = { 2, 4 * 4 };"
1131     NL, "        size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
1132     NL, "        ls[1] = ls[1]? ls[1]: 1;"
1133     NL, "        ndrange = ndrange_2D(gs, ls);"
1134     NL, "      }"
1135     NL, "      break;"
1136     NL, "    case 2:"
1137     NL, "      {"
1138     NL, "        const size_t gs[] = { 2, 4, 4 };"
1139     NL, "        size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
1140     NL, "        ls[2] = ls[2]? ls[2]: 1;"
1141     NL, "        ndrange = ndrange_3D(gs, ls);"
1142     NL, "      }"
1143     NL, "      break;"
1144     NL, "    default:"
1145     NL, "      break;"
1146     NL, "  }"
1147     NL, ""
1148     NL, "  // Some work-items enqueues nested blocks with the same level"
1149     NL, "  size_t globalSizeX = get_global_size(0);"
1150     NL, "  size_t globalSizeY = get_global_size(1);"
1151     NL, "  size_t globalSizeZ = get_global_size(2);"
1152     NL, "  if((tidX < (globalSizeX >> 1)) && ((tidY < (globalSizeY >> 1)) || globalSizeY == 1) && ((tidZ < (globalSizeZ >> 1)) || globalSizeZ == 1))"
1153     NL, "  {"
1154     NL, "    atomic_inc(&res[linearId % maxGlobalWorkSize]);"
1155     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
1156     NL, "    if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
1157     NL, "  }"
1158     NL, "}"
1159     NL, ""
1160     NL, "kernel void enqueue_mix_wg_size_some_eq(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
1161     NL, "{"
1162     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
1163     NL, "}"
1164     NL
1165 };
1166 
generate_reference_results_some_eq_mix(std::vector<cl_int> & referenceResults,cl_int len,cl_int level,cl_int dim)1167 void generate_reference_results_some_eq_mix(std::vector<cl_int> &referenceResults, cl_int len, cl_int level, cl_int dim)
1168 {
1169     size_t globalSizeX = 1, globalSizeY = 1, globalSizeZ = 1;
1170     switch (dim)
1171     {
1172     case 0:
1173       globalSizeX = (level == nestingLevel) ? len: (2 * 4 * 4);
1174       break;
1175     case 1:
1176       globalSizeX = 2;
1177       globalSizeY = 4 * 4;
1178       break;
1179     case 2:
1180       globalSizeX = 2;
1181       globalSizeY = 4;
1182       globalSizeZ = 4;
1183       break;
1184     default:
1185       break;
1186     }
1187 
1188     if(--level < 0)
1189     {
1190         return;
1191     }
1192 
1193     for (size_t tidZ = 0; tidZ < globalSizeZ; ++tidZ)
1194     {
1195         for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
1196         {
1197             for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
1198             {
1199                 size_t linearID = globalSizeX * globalSizeY * tidZ + globalSizeX * tidY + tidX;
1200                 cl_int nextDim = (linearID + level) % 3;
1201                 if ((tidX < (globalSizeX >> 1)) && ((tidY < (globalSizeY >> 1)) || globalSizeY == 1) && ((tidZ < (globalSizeZ >> 1)) || globalSizeZ == 1))
1202                 {
1203                     ++referenceResults[linearID % len];
1204                     generate_reference_results_some_eq_mix(referenceResults, len, level, nextDim);
1205                 }
1206             }
1207         }
1208     }
1209 }
1210 
check_some_eq_mix(cl_int * results,cl_int len,cl_int nesting_level)1211 static int check_some_eq_mix(cl_int* results, cl_int len, cl_int nesting_level)
1212 {
1213     std::vector<cl_int> referenceResults(len, 0);
1214     generate_reference_results_some_eq_mix(referenceResults, len, nesting_level, 0);
1215 
1216     for(size_t i = 0; i < len; ++i)
1217     {
1218         if (results[i] != referenceResults[i])
1219         {
1220             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
1221             return (int)i;
1222         }
1223     }
1224 
1225     return -1;
1226 }
1227 
1228 static const char* enqueue_mix_wg_size_some_diff[] =
1229 {
1230     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
1231     NL, "{"
1232     NL, "  queue_t def_q = get_default_queue();"
1233     NL, "  size_t tidX = get_global_id(0);"
1234     NL, "  size_t tidY = get_global_id(1);"
1235     NL, "  size_t tidZ = get_global_id(2);"
1236     NL, "  size_t linearId = get_global_linear_id();"
1237     NL, "  if(--level < 0) return;"
1238     NL, ""
1239     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
1240     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
1241     NL, ""
1242     NL, "  ndrange_t ndrange;"
1243     NL, "  switch((linearId + level) % 3)"
1244     NL, "  {"
1245     NL, "    case 0:"
1246     NL, "      {"
1247     NL, "        const size_t gs = 8 * 8 * 8;"
1248     NL, "        size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
1249     NL, "        ls = ls? ls: 1;"
1250     NL, "        ndrange = ndrange_1D(gs, ls);"
1251     NL, "      }"
1252     NL, "      break;"
1253     NL, "    case 1:"
1254     NL, "      {"
1255     NL, "        const size_t gs[] = { 8, 8 * 8 };"
1256     NL, "        size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
1257     NL, "        ls[1] = ls[1]? ls[1]: 1;"
1258     NL, "        ndrange = ndrange_2D(gs, ls);"
1259     NL, "      }"
1260     NL, "      break;"
1261     NL, "    case 2:"
1262     NL, "      {"
1263     NL, "        const size_t gs[] = { 8, 8, 8 };"
1264     NL, "        size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
1265     NL, "        ls[2] = ls[2]? ls[2]: 1;"
1266     NL, "        ndrange = ndrange_3D(gs, ls);"
1267     NL, "      }"
1268     NL, "      break;"
1269     NL, "    default:"
1270     NL, "      break;"
1271     NL, "  }"
1272     NL, ""
1273     NL, "  // Some work-items enqueues nested blocks with different levels"
1274     NL, "  if((tidX % 2) == 0 && (tidY % 2) == 0 && (tidZ % 2) == 0)"
1275     NL, "  {"
1276     NL, "    atomic_inc(&res[linearId % maxGlobalWorkSize]);"
1277     NL, "    if(level >= tidX && level >= tidY && level >= tidZ)"
1278     NL, "    {"
1279     NL, "      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
1280     NL, "      if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
1281     NL, "    }"
1282     NL, "  }"
1283     NL, "}"
1284     NL, ""
1285     NL, "kernel void enqueue_mix_wg_size_some_diff(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
1286     NL, "{"
1287     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
1288     NL, "}"
1289     NL
1290 };
1291 
generate_reference_results_some_diff_mix(std::vector<cl_int> & referenceResults,cl_int len,cl_int level,cl_int dim)1292 void generate_reference_results_some_diff_mix(std::vector<cl_int> &referenceResults, cl_int len, cl_int level, cl_int dim)
1293 {
1294     size_t globalSizeX = 1, globalSizeY = 1, globalSizeZ = 1;
1295     switch (dim)
1296     {
1297     case 0:
1298       globalSizeX = (level == nestingLevel) ? len: (8 * 8 * 8);
1299       break;
1300     case 1:
1301       globalSizeX = 8;
1302       globalSizeY = 8 * 8;
1303       break;
1304     case 2:
1305       globalSizeX = 8;
1306       globalSizeY = 8;
1307       globalSizeZ = 8;
1308       break;
1309     default:
1310       return;
1311       break;
1312     }
1313 
1314     if(--level < 0)
1315     {
1316         return;
1317     }
1318 
1319     for (size_t tidZ = 0; tidZ < globalSizeZ; ++tidZ)
1320     {
1321         for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
1322         {
1323             for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
1324             {
1325                 size_t linearID = globalSizeX * globalSizeY * tidZ + globalSizeX * tidY + tidX;
1326                 cl_int nextDim = (linearID + level) % 3;
1327                 if ((tidX % 2) == 0 && (tidY % 2) == 0 && (tidZ % 2) == 0)
1328                 {
1329                     ++referenceResults[linearID % len];
1330                     if (level >= tidX && level >= tidY && level >= tidZ)
1331                     {
1332                         generate_reference_results_some_diff_mix(referenceResults, len, level, nextDim);
1333                     }
1334                 }
1335             }
1336         }
1337     }
1338 }
1339 
check_some_diff_mix(cl_int * results,cl_int len,cl_int nesting_level)1340 static int check_some_diff_mix(cl_int* results, cl_int len, cl_int nesting_level)
1341 {
1342     std::vector<cl_int> referenceResults(len, 0);
1343     generate_reference_results_some_diff_mix(referenceResults, len, nesting_level, 0);
1344 
1345     for(size_t i = 0; i < len; ++i)
1346     {
1347         if (results[i] != referenceResults[i])
1348         {
1349             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
1350             return (int)i;
1351         }
1352     }
1353 
1354     return -1;
1355 }
1356 
1357 static const char* enqueue_mix_wg_size_all_eq[] =
1358 {
1359     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
1360     NL, "{"
1361     NL, "  queue_t def_q = get_default_queue();"
1362     NL, "  size_t tidX = get_global_id(0);"
1363     NL, "  size_t tidY = get_global_id(1);"
1364     NL, "  size_t tidZ = get_global_id(2);"
1365     NL, "  size_t linearId = get_global_linear_id();"
1366     NL, "  if(--level < 0) return;"
1367     NL, ""
1368     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
1369     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
1370     NL, ""
1371     NL, "  ndrange_t ndrange;"
1372     NL, "  switch((linearId + level) % 3)"
1373     NL, "  {"
1374     NL, "    case 0:"
1375     NL, "      {"
1376     NL, "        const size_t gs = 2 * 2 * 2;"
1377     NL, "        size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
1378     NL, "        ls = ls? ls: 1;"
1379     NL, "        ndrange = ndrange_1D(gs, ls);"
1380     NL, "      }"
1381     NL, "      break;"
1382     NL, "    case 1:"
1383     NL, "      {"
1384     NL, "        const size_t gs[] = { 2, 2 * 2 };"
1385     NL, "        size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
1386     NL, "        ls[1] = ls[1]? ls[1]: 1;"
1387     NL, "        ndrange = ndrange_2D(gs, ls);"
1388     NL, "      }"
1389     NL, "      break;"
1390     NL, "    case 2:"
1391     NL, "      {"
1392     NL, "        const size_t gs[] = { 2, 2, 2 };"
1393     NL, "        size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
1394     NL, "        ls[2] = ls[2]? ls[2]: 1;"
1395     NL, "        ndrange = ndrange_3D(gs, ls);"
1396     NL, "      }"
1397     NL, "      break;"
1398     NL, "    default:"
1399     NL, "      break;"
1400     NL, "  }"
1401     NL, ""
1402     NL, "  // All work-items enqueues nested blocks with the same level"
1403     NL, "  atomic_inc(&res[linearId % maxGlobalWorkSize]);"
1404     NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
1405     NL, "  if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
1406     NL, "}"
1407     NL, ""
1408     NL, "kernel void enqueue_mix_wg_size_all_eq(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
1409     NL, "{"
1410     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
1411     NL, "}"
1412     NL
1413 };
1414 
generate_reference_results_all_eq_mix(std::vector<cl_int> & referenceResults,cl_int len,cl_int level,cl_int dim)1415 void generate_reference_results_all_eq_mix(std::vector<cl_int> &referenceResults, cl_int len, cl_int level, cl_int dim)
1416 {
1417     size_t globalSizeX = 1, globalSizeY = 1, globalSizeZ = 1;
1418     switch (dim)
1419     {
1420     case 0:
1421       globalSizeX = (level == nestingLevel) ? len: (2 * 2 * 2);
1422       break;
1423     case 1:
1424       globalSizeX = 2;
1425       globalSizeY = 2 * 2;
1426       break;
1427     case 2:
1428       globalSizeX = 2;
1429       globalSizeY = 2;
1430       globalSizeZ = 2;
1431       break;
1432     default:
1433       break;
1434     }
1435 
1436     if(--level < 0)
1437     {
1438         return;
1439     }
1440 
1441     for (size_t tidZ = 0; tidZ < globalSizeZ; ++tidZ)
1442     {
1443         for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
1444         {
1445             for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
1446             {
1447                 size_t linearID = globalSizeX * globalSizeY * tidZ + globalSizeX * tidY + tidX;
1448                 cl_int nextDim = (linearID + level) % 3;
1449                 ++referenceResults[linearID % len];
1450                 generate_reference_results_all_eq_mix(referenceResults, len, level, nextDim);
1451             }
1452         }
1453     }
1454 }
1455 
check_all_eq_mix(cl_int * results,cl_int len,cl_int nesting_level)1456 static int check_all_eq_mix(cl_int* results, cl_int len, cl_int nesting_level)
1457 {
1458     std::vector<cl_int> referenceResults(len, 0);
1459     generate_reference_results_all_eq_mix(referenceResults, len, nesting_level, 0);
1460 
1461     for(size_t i = 0; i < len; ++i)
1462     {
1463         if (results[i] != referenceResults[i])
1464         {
1465             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
1466             return (int)i;
1467         }
1468     }
1469 
1470     return -1;
1471 }
1472 
1473 static const char* enqueue_mix_wg_size_all_diff[] =
1474 {
1475     NL, "void block_fn(int level, int maxGlobalWorkSize, __global int* rnd, __global int* res)"
1476     NL, "{"
1477     NL, "  queue_t def_q = get_default_queue();"
1478     NL, "  size_t tidX = get_global_id(0);"
1479     NL, "  size_t tidY = get_global_id(1);"
1480     NL, "  size_t tidZ = get_global_id(2);"
1481     NL, "  size_t linearId = get_global_linear_id();"
1482     NL, "  if(--level < 0) return;"
1483     NL, ""
1484     NL, "  void (^kernelBlock)(void) = ^{ block_fn(level, maxGlobalWorkSize, rnd, res); };"
1485     NL, "  uint wg = get_kernel_work_group_size(kernelBlock);"
1486     NL, ""
1487     NL, "  ndrange_t ndrange;"
1488     NL, "  switch((linearId + level) % 3)"
1489     NL, "  {"
1490     NL, "    case 0:"
1491     NL, "      {"
1492     NL, "        const size_t gs = 8 * 8 * 8;"
1493     NL, "        size_t ls = rnd[tidX % maxGlobalWorkSize] % wg % gs;"
1494     NL, "        ls = ls? ls: 1;"
1495     NL, "        ndrange = ndrange_1D(gs, ls);"
1496     NL, "      }"
1497     NL, "      break;"
1498     NL, "    case 1:"
1499     NL, "      {"
1500     NL, "        const size_t gs[] = { 8, 8 * 8 };"
1501     NL, "        size_t ls[] = { 1, rnd[tidY % maxGlobalWorkSize] % wg % gs[1] };"
1502     NL, "        ls[1] = ls[1]? ls[1]: 1;"
1503     NL, "        ndrange = ndrange_2D(gs, ls);"
1504     NL, "      }"
1505     NL, "      break;"
1506     NL, "    case 2:"
1507     NL, "      {"
1508     NL, "        const size_t gs[] = { 8, 8, 8 };"
1509     NL, "        size_t ls[] = { 1, 1, rnd[tidZ % maxGlobalWorkSize] % wg % gs[2] };"
1510     NL, "        ls[2] = ls[2]? ls[2]: 1;"
1511     NL, "        ndrange = ndrange_3D(gs, ls);"
1512     NL, "      }"
1513     NL, "      break;"
1514     NL, "    default:"
1515     NL, "      break;"
1516     NL, "  }"
1517     NL, ""
1518     NL, "  // All work-items enqueues nested blocks with different levels"
1519     NL, "  atomic_inc(&res[linearId % maxGlobalWorkSize]);"
1520     NL, "  if(level >= tidX && level >= tidY && level >= tidZ)"
1521     NL, "  {"
1522     NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
1523     NL, "    if(enq_res != CLK_SUCCESS) { res[linearId % maxGlobalWorkSize] = -1; return; }"
1524     NL, "  }"
1525     NL, "}"
1526     NL, ""
1527     NL, "kernel void enqueue_mix_wg_size_all_diff(__global int* res, int level, int maxGlobalWorkSize, __global int* rnd)"
1528     NL, "{"
1529     NL, "  block_fn(level, maxGlobalWorkSize, rnd, res);"
1530     NL, "}"
1531     NL
1532 };
1533 
generate_reference_results_all_diff_mix(std::vector<cl_int> & referenceResults,cl_int len,cl_int level,cl_int dim)1534 void generate_reference_results_all_diff_mix(std::vector<cl_int> &referenceResults, cl_int len, cl_int level, cl_int dim)
1535 {
1536     size_t globalSizeX = 1, globalSizeY = 1, globalSizeZ = 1;
1537     switch (dim)
1538     {
1539     case 0:
1540       globalSizeX = (level == nestingLevel) ? len: (8 * 8 * 8);
1541       break;
1542     case 1:
1543       globalSizeX = 8;
1544       globalSizeY = 8 * 8;
1545       break;
1546     case 2:
1547       globalSizeX = 8;
1548       globalSizeY = 8;
1549       globalSizeZ = 8;
1550       break;
1551     default:
1552       break;
1553     }
1554 
1555     if(--level < 0)
1556     {
1557         return;
1558     }
1559 
1560     for (size_t tidZ = 0; tidZ < globalSizeZ; ++tidZ)
1561     {
1562         for (size_t tidY = 0; tidY < globalSizeY; ++tidY)
1563         {
1564             for (size_t tidX = 0; tidX < globalSizeX; ++tidX)
1565             {
1566                 size_t linearID = globalSizeX * globalSizeY * tidZ + globalSizeX * tidY + tidX;
1567                 cl_int nextDim = (linearID + level) % 3;
1568                 ++referenceResults[linearID % len];
1569                 if (level >= tidX && level >= tidY && level >= tidZ)
1570                 {
1571                     generate_reference_results_all_diff_mix(referenceResults, len, level, nextDim);
1572                 }
1573             }
1574         }
1575     }
1576 }
1577 
check_all_diff_mix(cl_int * results,cl_int len,cl_int nesting_level)1578 static int check_all_diff_mix(cl_int* results, cl_int len, cl_int nesting_level)
1579 {
1580     std::vector<cl_int> referenceResults(len, 0);
1581     generate_reference_results_all_diff_mix(referenceResults, len, nesting_level, 0);
1582 
1583     for(size_t i = 0; i < len; ++i)
1584     {
1585         if (results[i] != referenceResults[i])
1586         {
1587             log_error("ERROR: Kernel returned %d vs. expected %d, index: %d\n", results[i], referenceResults[i], i);
1588             return (int)i;
1589         }
1590     }
1591 
1592     return -1;
1593 }
1594 
1595 static const kernel_src_check sources_enqueue_wg_size[] =
1596 {
1597     { KERNEL(enqueue_1D_wg_size_single), check_single },
1598     { KERNEL(enqueue_1D_wg_size_some_eq), check_some_eq_1D },
1599     { KERNEL(enqueue_1D_wg_size_some_diff), check_some_diff_1D },
1600     { KERNEL(enqueue_1D_wg_size_all_eq), check_all_eq_1D },
1601     { KERNEL(enqueue_1D_wg_size_all_diff), check_all_diff_1D },
1602 
1603     { KERNEL(enqueue_2D_wg_size_single), check_single },
1604     { KERNEL(enqueue_2D_wg_size_some_eq), check_some_eq_2D },
1605     { KERNEL(enqueue_2D_wg_size_some_diff), check_some_diff_2D },
1606     { KERNEL(enqueue_2D_wg_size_all_eq), check_all_eq_2D },
1607     { KERNEL(enqueue_2D_wg_size_all_diff), check_all_diff_2D },
1608 
1609     { KERNEL(enqueue_3D_wg_size_single), check_single },
1610     { KERNEL(enqueue_3D_wg_size_some_eq), check_some_eq_3D },
1611     { KERNEL(enqueue_3D_wg_size_some_diff), check_some_diff_3D },
1612     { KERNEL(enqueue_3D_wg_size_all_eq), check_all_eq_3D },
1613     { KERNEL(enqueue_3D_wg_size_all_diff), check_all_diff_3D },
1614 
1615     { KERNEL(enqueue_mix_wg_size_single), check_single },
1616     { KERNEL(enqueue_mix_wg_size_some_eq), check_some_eq_mix },
1617     { KERNEL(enqueue_mix_wg_size_some_diff), check_some_diff_mix },
1618     { KERNEL(enqueue_mix_wg_size_all_eq), check_all_eq_mix },
1619     { KERNEL(enqueue_mix_wg_size_all_diff), check_all_diff_mix }
1620 };
1621 
test_enqueue_wg_size(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)1622 int test_enqueue_wg_size(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
1623 {
1624     MTdata d;
1625     cl_uint i, k;
1626     cl_int err_ret, res = 0;
1627     clCommandQueueWrapper dev_queue;
1628     const cl_int MAX_GLOBAL_WORK_SIZE = MAX_GWS / 4;
1629     cl_int kernel_results[MAX_GLOBAL_WORK_SIZE] = { 0 };
1630     cl_uint vrnd[MAX_GLOBAL_WORK_SIZE] = { 0 };
1631 
1632     size_t ret_len;
1633     cl_uint max_queues = 1;
1634     cl_uint maxQueueSize = 0;
1635     d = init_genrand(gRandomSeed);
1636 
1637     if(gWimpyMode)
1638     {
1639         nestingLevel = 2;
1640         vlog( "*** WARNING: Testing in Wimpy mode!                     ***\n" );
1641         vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
1642     }
1643 
1644     err_ret = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, sizeof(maxQueueSize), &maxQueueSize, 0);
1645     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE) failed");
1646 
1647     err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_QUEUES, sizeof(max_queues), &max_queues, &ret_len);
1648     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_ON_DEVICE_QUEUES) failed");
1649 
1650     size_t max_local_size = 1;
1651     err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, &ret_len);
1652     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed");
1653 
1654     cl_queue_properties queue_prop_def[] =
1655     {
1656         CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
1657         CL_QUEUE_SIZE, maxQueueSize,
1658         0
1659     };
1660 
1661     dev_queue = clCreateCommandQueueWithProperties(context, device, queue_prop_def, &err_ret);
1662     test_error(err_ret, "clCreateCommandQueueWithProperties(CL_QUEUE_DEVICE|CL_QUEUE_DEFAULT) failed");
1663 
1664 
1665     size_t failCnt = 0;
1666     for(k = 0; k < arr_size(sources_enqueue_wg_size); ++k)
1667     {
1668         if (!gKernelName.empty() && gKernelName != sources_enqueue_wg_size[k].src.kernel_name)
1669             continue;
1670 
1671         log_info("Running '%s' kernel (%d of %d) ...\n", sources_enqueue_wg_size[k].src.kernel_name, k + 1, arr_size(sources_enqueue_wg_size));
1672         for(i = 0; i < MAX_GLOBAL_WORK_SIZE; ++i)
1673         {
1674             kernel_results[i] = 0;
1675             vrnd[i] = genrand_int32(d);
1676         }
1677 
1678         // Fill some elements with prime numbers
1679         cl_uint prime[] = { 3,   5,   7,  11,  13,  17,  19,  23,
1680             29,  31,  37,  41,  43,  47,  53,  59,
1681             61,  67,  71,  73,  79,  83,  89,  97,
1682             101, 103, 107, 109, 113, 127 };
1683 
1684         for(i = 0; i < arr_size(prime); ++i)
1685         {
1686             vrnd[genrand_int32(d) % MAX_GLOBAL_WORK_SIZE] = prime[i];
1687         }
1688 
1689         clMemWrapper mem;
1690         mem = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(vrnd), vrnd, &err_ret);
1691         test_error(err_ret, "clCreateBuffer() failed");
1692 
1693         kernel_arg args[] =
1694         {
1695             { sizeof(cl_uint), &nestingLevel },
1696             { sizeof(cl_uint), &MAX_GLOBAL_WORK_SIZE },
1697             { sizeof(cl_mem),  &mem }
1698         };
1699 
1700         size_t global_size = MAX_GLOBAL_WORK_SIZE;
1701         size_t local_size = (max_local_size > global_size) ? global_size : max_local_size;
1702 
1703         err_ret = run_n_kernel_args(context, queue, sources_enqueue_wg_size[k].src.lines, sources_enqueue_wg_size[k].src.num_lines, sources_enqueue_wg_size[k].src.kernel_name, local_size, global_size, kernel_results, sizeof(kernel_results), arr_size(args), args);
1704 
1705         //check results
1706         int fail = sources_enqueue_wg_size[k].check(kernel_results, global_size, nestingLevel);
1707 
1708         if(check_error(err_ret, "'%s' kernel execution failed", sources_enqueue_wg_size[k].src.kernel_name)) { ++failCnt; res = -1; continue; }
1709         else if(fail >= 0 && check_error(-1, "'%s' kernel results validation failed: [%d]", sources_enqueue_wg_size[k].src.kernel_name, fail)) { ++failCnt; res = -1; continue; }
1710         else log_info("'%s' kernel is OK.\n", sources_enqueue_wg_size[k].src.kernel_name);
1711     }
1712 
1713     if (failCnt > 0)
1714     {
1715         log_error("ERROR: %d of %d kernels failed.\n", failCnt, arr_size(sources_enqueue_wg_size));
1716     }
1717 
1718     free_mtdata(d);
1719 
1720     return res;
1721 }
1722 
1723 #endif
1724 
1725