//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include <stdio.h>
#include <string.h>
#include "harness/testHarness.h"
#include "harness/typeWrappers.h"

#include <vector>

#include "procs.h"
#include "utils.h"
#include <time.h>


#ifdef CL_VERSION_2_0
extern int gWimpyMode;
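
// The device-side helper kernels below are stored as source-line arrays and
// built at run time. Each helper enqueues child kernels via ndrange_1D/2D/3D
// (global size only, global + local size, or offset + global + local size);
// every child work-item atomically increments a slot of the 'val' histogram,
// which the host later checks against the generate_reference_* functions.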
static const char *helper_ndrange_1d_glo[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
    "memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "  size_t tid = get_global_id(0);" NL,
    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "  for(int i = 0; i < n; i++)" NL,
    "  {" NL,
    "    ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);" NL,
    "    int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "  }" NL,
    "}" NL
};

static const char *helper_ndrange_1d_loc[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
    "memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "  size_t tid = get_global_id(0);" NL,
    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "  for(int k = 0; k < n; k++)" NL,
    "  {" NL,
    "    for(int i = 0; i < n; i++)" NL,
    "    {" NL,
    "      if (glob_size_arr[i] >= loc_size_arr[k])" NL,
    "      {" NL,
    "        ndrange_t ndrange = ndrange_1D(glob_size_arr[i], "
    "loc_size_arr[k]);" NL,
    "        int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "      }" NL,
    "    }" NL,
    "  }" NL,
    "}" NL
};

static const char *helper_ndrange_1d_ofs[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "  atomic_fetch_add_explicit(&val[(get_global_offset(0) + "
    "get_global_linear_id()) % len], 1u, memory_order_relaxed, "
    "memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "  size_t tid = get_global_id(0);" NL,
    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "  for(int l = 0; l < n; l++)" NL,
    "  {" NL,
    "    for(int k = 0; k < n; k++)" NL,
    "    {" NL,
    "      for(int i = 0; i < n; i++)" NL,
    "      {" NL,
    "        if (glob_size_arr[i] >= loc_size_arr[k])" NL,
    "        {" NL,
    "          ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], "
    "loc_size_arr[k]);" NL,
    "          int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "        }" NL,
    "      }" NL,
    "    }" NL,
    "  }" NL,
    "}" NL
};

static const char *helper_ndrange_2d_glo[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
    "memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "  size_t tid = get_global_id(0);" NL,
    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "  for(int i = 0; i < n; i++)" NL,
    "  {" NL,
    "    size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] "
    "};" NL,
    "    ndrange_t ndrange = ndrange_2D(glob_size);" NL,
    "    int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "  }" NL,
    "}" NL
};

static const char *helper_ndrange_2d_loc[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
    "memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "  size_t tid = get_global_id(0);" NL,
    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "  for(int k = 0; k < n; k++)" NL,
    "  {" NL,
    "    for(int i = 0; i < n; i++)" NL,
    "    {" NL,
    "      if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
    "      {" NL,
    "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
    "n] };" NL,
    "        size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
    "" NL,
    "        ndrange_t ndrange = ndrange_2D(glob_size, loc_size);" NL,
    "        int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "      }" NL,
    "    }" NL,
    "  }" NL,
    "}" NL
};


static const char *helper_ndrange_2d_ofs[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "  atomic_fetch_add_explicit(&val[(get_global_offset(1) * "
    "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
    "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "  size_t tid = get_global_id(0);" NL,
    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "  for(int l = 0; l < n; l++)" NL,
    "  {" NL,
    "    for(int k = 0; k < n; k++)" NL,
    "    {" NL,
    "      for(int i = 0; i < n; i++)" NL,
    "      {" NL,
    "        if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
    "        {" NL,
    "          size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) "
    "% n]};" NL,
    "          size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
    "          size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };" NL,
    "" NL,
    "          ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);" NL,
    "          int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "        }" NL,
    "      }" NL,
    "    }" NL,
    "  }" NL,
    "}" NL
};


static const char *helper_ndrange_3d_glo[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
    "memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "  size_t tid = get_global_id(0);" NL,
    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "  for(int i = 0; i < n; i++)" NL,
    "  {" NL,
    "    uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % "
    "n] * glob_size_arr[(i + 2) % n];" NL,
    "    if (global_work_size <= (len * len))" NL,
    "    {" NL,
    "      size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
    "n], glob_size_arr[(i + 2) % n] };" NL,
    "      ndrange_t ndrange = ndrange_3D(glob_size);" NL,
    "      int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "      if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "    }" NL,
    "  }" NL,
    "}" NL
};


static const char *helper_ndrange_3d_loc[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
    "memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "  size_t tid = get_global_id(0);" NL,
    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "  for(int k = 0; k < n; k++)" NL,
    "  {" NL,
    "    for(int i = 0; i < n; i++)" NL,
    "    {" NL,
    "      uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % "
    "n] * glob_size_arr[(i + 2) % n];" NL,
    "      if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
    "global_work_size <= (len * len))" NL,
    "      {" NL,
    "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
    "n], glob_size_arr[(i + 2) % n] };" NL,
    "        size_t loc_size[] = { 1, 1, loc_size_arr[k] };" NL,
    "        ndrange_t ndrange = ndrange_3D(glob_size,loc_size);" NL,
    "        int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "      }" NL,
    "    }" NL,
    "  }" NL,
    "}" NL
};

static const char *helper_ndrange_3d_ofs[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "  atomic_fetch_add_explicit(&val[(get_global_offset(2) * "
    "get_global_size(0) * get_global_size(1) + get_global_offset(1) * "
    "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
    "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "  size_t tid = get_global_id(0);" NL,
    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "  for(int l = 0; l < n; l++)" NL,
    "  {" NL,
    "    for(int k = 0; k < n; k++)" NL,
    "    {" NL,
    "      for(int i = 0; i < n; i++)" NL,
    "      {" NL,
    "        uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) "
    "% n] * glob_size_arr[(i + 2) % n];" NL,
    "        if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
    "global_work_size <= (len * len))" NL,
    "        {" NL,
    "          size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) "
    "% n], glob_size_arr[(i + 2) % n]};" NL,
    "          size_t loc_size[3] = { 1, 1, loc_size_arr[k] };" NL,
    "          size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l "
    "+ 2) % n] };" NL,
    "          ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);" NL,
    "          int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "        }" NL,
    "      }" NL,
    "    }" NL,
    "  }" NL,
    "}" NL
};
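
// Table of helper kernels to run: { kernel source, work dimensions, whether a
// local size is passed to ndrange_ND, whether a global offset is passed }.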
static const kernel_src_dim_check sources_ndrange_Xd[] =
{
    { KERNEL(helper_ndrange_1d_glo), 1, CL_FALSE, CL_FALSE},
    { KERNEL(helper_ndrange_1d_loc), 1, CL_TRUE, CL_FALSE},
    { KERNEL(helper_ndrange_1d_ofs), 1, CL_TRUE, CL_TRUE},
    { KERNEL(helper_ndrange_2d_glo), 2, CL_FALSE, CL_FALSE},
    { KERNEL(helper_ndrange_2d_loc), 2, CL_TRUE, CL_FALSE},
    { KERNEL(helper_ndrange_2d_ofs), 2, CL_TRUE, CL_TRUE},
    { KERNEL(helper_ndrange_3d_glo), 3, CL_FALSE, CL_FALSE},
    { KERNEL(helper_ndrange_3d_loc), 3, CL_TRUE, CL_FALSE},
    { KERNEL(helper_ndrange_3d_ofs), 3, CL_TRUE, CL_TRUE},
};
static const size_t num_kernels_ndrange_Xd = arr_size(sources_ndrange_Xd);

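// Return the index of the first non-zero per-work-item status flag (set to -1
// by a helper kernel when enqueue_kernel fails), or -1 if every entry is 0.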
static int check_kernel_results(cl_int* results, cl_int len)
{
    for(cl_int i = 0; i < len; ++i)
    {
        if(results[i] != 0) return i;
    }
    return -1;
}

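// The generate_reference_* functions replay on the host the same loops and
// guards the device-side helpers use when enqueuing child kernels, so each
// histogram slot ends up with the expected number of atomic increments.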
void generate_reference_1D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr)
{
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t w = 0; w < glob_size_arr[g]; ++w)
        {
            ++reference_results[w];
        }
    }
}

void generate_reference_1D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr)
{
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t l = 0; l < loc_size_arr.size(); ++l)
        {
            if (glob_size_arr[g] >= loc_size_arr[l])
            {
                for (size_t w = 0; w < glob_size_arr[g]; ++w)
                {
                    ++reference_results[w];
                }
            }
        }
    }
}

void generate_reference_1D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
{
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t l = 0; l < loc_size_arr.size(); ++l)
        {
            if (glob_size_arr[g] >= loc_size_arr[l])
            {
                for (size_t o = 0; o < offset.size(); ++o)
                {
                    for (size_t w = 0; w < glob_size_arr[g]; ++w)
                    {
                        ++reference_results[(offset[o] + w) % len];
                    }
                }
            }
        }
    }
}

void generate_reference_2D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
{
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t h = 0; h < glob_size_arr[(g + 1) % glob_size_arr.size()]; ++h)
        {
            for (size_t w = 0; w < glob_size_arr[g]; ++w)
            {
                ++reference_results[(h * glob_size_arr[g] + w) % len];
            }
        }
    }
}

void generate_reference_2D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
{
    size_t n = glob_size_arr.size();
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t l = 0; l < loc_size_arr.size(); ++l)
        {
            if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
            {
                for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
                {
                    for (size_t w = 0; w < glob_size_arr[g]; ++w)
                    {
                        ++reference_results[(h * glob_size_arr[g] + w) % len];
                    }
                }
            }
        }
    }
}

void generate_reference_2D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
{
    size_t n = glob_size_arr.size();
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t l = 0; l < loc_size_arr.size(); ++l)
        {
            if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
            {
                for (size_t o = 0; o < offset.size(); ++o)
                {
                    for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
                    {
                        for (size_t w = 0; w < glob_size_arr[g]; ++w)
                        {
                            ++reference_results[(glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + h * glob_size_arr[g] + w) % len];
                        }
                    }
                }
            }
        }
    }
}

void generate_reference_3D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
{
    size_t n = glob_size_arr.size();
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
        if (global_work_size <= (len * len))
        {
            for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
            {
                for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
                {
                    for (size_t w = 0; w < glob_size_arr[g]; ++w)
                    {
                        ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
                    }
                }
            }
        }
    }
}

void generate_reference_3D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
{
    size_t n = glob_size_arr.size();
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t l = 0; l < loc_size_arr.size(); ++l)
        {
            size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
            if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
            {
                for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
                {
                    for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
                    {
                        for (size_t w = 0; w < glob_size_arr[g]; ++w)
                        {
                            ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
                        }
                    }
                }
            }
        }
    }
}

void generate_reference_3D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
{
    size_t n = glob_size_arr.size();
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t l = 0; l < loc_size_arr.size(); ++l)
        {
            size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
            if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
            {
                for (size_t o = 0; o < offset.size(); ++o)
                {
                    for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
                    {
                        for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
                        {
                            for (size_t w = 0; w < glob_size_arr[g]; ++w)
                            {
                                ++reference_results[(glob_size_arr[g] * glob_size_arr[(g + 1) % n] * offset[(o + 2) % n] + glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
                            }
                        }
                    }
                }
            }
        }
    }
}

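// Build the host-side reference histogram for the requested dimensionality
// and ndrange variant, then compare it against the device results; returns
// the index of the first mismatch, or -1 on success.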
static int check_kernel_results(cl_int* results, cl_int len, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_int dim, cl_bool use_local, cl_bool use_offset)
{
    std::vector<cl_int> reference_results(len, 0);
    switch (dim)
    {
    case 1:
        if (use_local == CL_FALSE)
        {
            generate_reference_1D(reference_results, glob_size_arr);
        }
        else if (use_local == CL_TRUE && use_offset == CL_FALSE)
        {
            generate_reference_1D_local(reference_results, glob_size_arr, loc_size_arr);
        }
        else
        {
            generate_reference_1D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
        }
        break;
    case 2:
        if (use_local == CL_FALSE)
        {
            generate_reference_2D(reference_results, glob_size_arr, len);
        }
        else if (use_local == CL_TRUE && use_offset == CL_FALSE)
        {
            generate_reference_2D_local(reference_results, glob_size_arr, loc_size_arr, len);
        }
        else
        {
            generate_reference_2D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
        }
        break;
    case 3:
        if (use_local == CL_FALSE)
        {
            generate_reference_3D(reference_results, glob_size_arr, len);
        }
        else if (use_local == CL_TRUE && use_offset == CL_FALSE)
        {
            generate_reference_3D_local(reference_results, glob_size_arr, loc_size_arr, len);
        }
        else
        {
            generate_reference_3D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
        }
        break;
    default:
        return 0;
    }

    for (cl_int i = 0; i < len; ++i)
    {
        if (results[i] != reference_results[i])
        {
            log_error("ERROR: Kernel returned %d vs. expected %d\n", results[i], reference_results[i]);
            return i;
        }
    }

    return -1;
}

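// Entry point for the device-side enqueue ndrange tests: creates the default
// on-device queue, fills glob_size_arr/loc_size_arr/ofs_arr with fixed and
// randomized values, runs each helper kernel from sources_ndrange_Xd through
// run_single_kernel_args, and validates both the per-work-item status flags
// and the accumulated histogram against the host-side reference.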
int test_enqueue_ndrange(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
{
    MTdata d;
    cl_uint i;
    cl_int err_ret, res = 0;
    clCommandQueueWrapper dev_queue;
    cl_int k, kernel_results[MAX_GWS] = { 0 };

    size_t ret_len;
    cl_uint max_queues = 1;
    cl_uint maxQueueSize = 0;

    d = init_genrand(gRandomSeed);

    err_ret = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, sizeof(maxQueueSize), &maxQueueSize, 0);
    test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE) failed");

    err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_QUEUES, sizeof(max_queues), &max_queues, &ret_len);
    test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_ON_DEVICE_QUEUES) failed");

    size_t max_local_size = 1;
    err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, &ret_len);
    test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed");

    cl_queue_properties queue_prop_def[] =
    {
        CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
        CL_QUEUE_SIZE, maxQueueSize,
        0
    };

    dev_queue = clCreateCommandQueueWithProperties(context, device, queue_prop_def, &err_ret);
    test_error(err_ret, "clCreateCommandQueueWithProperties(CL_QUEUE_DEVICE|CL_QUEUE_DEFAULT) failed");

    max_local_size = (max_local_size > MAX_GWS) ? MAX_GWS : max_local_size;
    if(gWimpyMode)
    {
        max_local_size = MIN(8, max_local_size);
    }

    cl_uint num = 10;
    cl_uint global_work_size = max_local_size * 2;
    std::vector<cl_uint> glob_size_arr(num);
    std::vector<cl_uint> loc_size_arr(num);
    std::vector<cl_uint> ofs_arr(num);
    std::vector<cl_int> glob_results(global_work_size, 0);

    glob_size_arr[0] = 1;
    glob_size_arr[1] = global_work_size;
    loc_size_arr[0] = 1;
    loc_size_arr[1] = max_local_size;
    ofs_arr[0] = 0;
    ofs_arr[1] = 1;

    for(i = 2; i < num; ++i)
    {
        glob_size_arr[i] = genrand_int32(d) % global_work_size;
        glob_size_arr[i] = glob_size_arr[i] ? glob_size_arr[i] : 1;
        loc_size_arr[i] = genrand_int32(d) % max_local_size;
        loc_size_arr[i] = loc_size_arr[i] ? loc_size_arr[i] : 1;
        ofs_arr[i] = genrand_int32(d) % global_work_size;
    }

    // check ndrange_dX functions
    size_t failCnt = 0;
    for(i = 0; i < num_kernels_ndrange_Xd; ++i)
    {
        if (!gKernelName.empty() && gKernelName != sources_ndrange_Xd[i].src.kernel_name)
            continue;

        clMemWrapper mem1 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, glob_size_arr.size() * sizeof(cl_uint), &glob_size_arr[0], &err_ret);
        test_error(err_ret, "clCreateBuffer() failed");
        clMemWrapper mem2 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, loc_size_arr.size() * sizeof(cl_uint), &loc_size_arr[0], &err_ret);
        test_error(err_ret, "clCreateBuffer() failed");
        clMemWrapper mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, glob_results.size() * sizeof(cl_int), &glob_results[0], &err_ret);
        test_error(err_ret, "clCreateBuffer() failed");
        clMemWrapper mem4 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, ofs_arr.size() * sizeof(cl_uint), &ofs_arr[0], &err_ret);
        test_error(err_ret, "clCreateBuffer() failed");

        kernel_arg args[] =
        {
            { sizeof(cl_uint), &num },
            { sizeof(cl_uint), &global_work_size },
            { sizeof(cl_mem), &mem1 },
            { sizeof(cl_mem), &mem2 },
            { sizeof(cl_mem), &mem3 },
            { sizeof(cl_mem), &mem4 },
        };

        log_info("Running '%s' kernel (%u of %zu) ...\n", sources_ndrange_Xd[i].src.kernel_name, i + 1, num_kernels_ndrange_Xd);
        // Keep the kernel-execution status in its own variable so the
        // clEnqueueMapBuffer status below does not overwrite it before it is
        // checked.
        cl_int kernel_err = run_single_kernel_args(context, queue, sources_ndrange_Xd[i].src.lines, sources_ndrange_Xd[i].src.num_lines, sources_ndrange_Xd[i].src.kernel_name, kernel_results, sizeof(kernel_results), arr_size(args), args);

        cl_int *ptr = (cl_int *)clEnqueueMapBuffer(queue, mem3, CL_TRUE, CL_MAP_READ, 0, glob_results.size() * sizeof(cl_int), 0, 0, 0, &err_ret);
        test_error(err_ret, "clEnqueueMapBuffer() failed");

        if(check_error(kernel_err, "'%s' kernel execution failed", sources_ndrange_Xd[i].src.kernel_name)) { ++failCnt; res = -1; }
        else if((k = check_kernel_results(kernel_results, arr_size(kernel_results))) >= 0 && check_error(-1, "'%s' kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, kernel_results[k])) res = -1;
        else if((k = check_kernel_results(ptr, global_work_size, glob_size_arr, loc_size_arr, ofs_arr, sources_ndrange_Xd[i].dim, sources_ndrange_Xd[i].localSize, sources_ndrange_Xd[i].offset)) >= 0 && check_error(-1, "'%s' global kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, ptr[k])) res = -1;
        else log_info("'%s' kernel is OK.\n", sources_ndrange_Xd[i].src.kernel_name);

        err_ret = clEnqueueUnmapMemObject(queue, mem3, ptr, 0, 0, 0);
        test_error(err_ret, "clEnqueueUnmapMemObject() failed");
    }

    if (failCnt > 0)
    {
        log_error("ERROR: %zu of %zu kernels failed.\n", failCnt, num_kernels_ndrange_Xd);
    }

    free_mtdata(d);

    return res;
}


#endif
