1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "TestNonUniformWorkGroup.h"
17 #include <vector>
18 #include <sstream>
19 #define NL "\n"
20 
21 size_t TestNonUniformWorkGroup::_maxLocalWorkgroupSize = 0;
22 bool TestNonUniformWorkGroup::_strictMode = false;
23 
24 // Main Kernel source code
25 static const char *KERNEL_FUNCTION =
26   NL "#define MAX_DIMS 3"
27   NL "typedef struct _DataContainerAttrib"
28   NL "{"
29   NL "    unsigned long get_global_size[MAX_DIMS];"
30   NL "    unsigned long get_global_offset[MAX_DIMS];"
31   NL "    unsigned long get_local_size[MAX_DIMS];"
32   NL "    unsigned long get_enqueued_local_size[MAX_DIMS];"
33   NL "    unsigned long get_global_id[MAX_DIMS];"
34   NL "    unsigned long get_local_id[MAX_DIMS];"
35   NL "    unsigned long get_group_id[MAX_DIMS];"
36   NL "    unsigned long get_num_groups[MAX_DIMS];"
37   NL "    unsigned long get_work_dim;"
38   NL "    unsigned short test_local_barrier_result_bool;"
39   NL "    unsigned short test_global_barrier_result_bool;"
40   NL "    unsigned short test_local_atomic_result_value;"
41   NL "}DataContainerAttrib;"
42 
43   NL "enum Error{"
44   NL "  ERR_GLOBAL_SIZE=0,"
45   NL "  ERR_GLOBAL_WORK_OFFSET,"
46   NL "  ERR_LOCAL_SIZE,"
47   NL "  ERR_GLOBAL_ID,"
48   NL "  ERR_LOCAL_ID,"
49   NL "  ERR_ENQUEUED_LOCAL_SIZE,"
50   NL "  ERR_NUM_GROUPS,"
51   NL "  ERR_GROUP_ID,"
52   NL "  ERR_WORK_DIM,"
53   NL "  ERR_GLOBAL_BARRIER,"
54   NL "  ERR_LOCAL_BARRIER,"
55   NL "  ERR_GLOBAL_ATOMIC,"
56   NL "  ERR_LOCAL_ATOMIC,"
57   NL "  ERR_STRICT_MODE,"
58   NL "  ERR_BUILD_STATUS,"
59   NL "  ERR_UNKNOWN,"
60   NL "  ERR_DIFFERENT,"
61   NL "  _LAST_ELEM"
62   NL "};"
63 
64   NL "uint getGlobalIndex (uint gid2, uint gid1, uint gid0) {"
65   NL "    return gid2*get_global_size(0)*get_global_size(1) + gid1*get_global_size(0) + gid0;"
66   NL "}"
67 
68   NL "int getRegionIndex () {"
69   NL "    uint gid0 = get_global_id(0) - get_global_offset(0);"
70   NL "    uint gid1 = get_global_id(1) - get_global_offset(1);"
71   NL "    uint gid2 = get_global_id(2) - get_global_offset(2);"
72   NL "    if (gid0 == 0 && gid1 == 0 && gid2 == 0) {"
73   NL "      return 0;"
74   NL "    } else if (gid0 == get_global_size(0) - 1 && gid1 == 0 && gid2 == 0) {"
75   NL "      return 1;"
76   NL "    } else if (gid0 == 0 && gid1 == get_global_size(1) - 1 && gid2 == 0) {"
77   NL "      return 2;"
78   NL "    } else if (gid0 == get_global_size(0) - 1 && gid1 == get_global_size(1) - 1 && gid2 == 0) {"
79   NL "      return 3;"
80   NL "    } else if (gid0 == 0 && gid1 == 0 && gid2 == get_global_size(2) - 1) {"
81   NL "      return 4;"
82   NL "    } else if (gid0 == get_global_size(0) - 1 && gid1 == 0 && gid2 == get_global_size(2) - 1) {"
83   NL "      return 5;"
84   NL "    } else if (gid0 == 0 && gid1 == get_global_size(1) - 1 && gid2 == get_global_size(2) - 1) {"
85   NL "      return 6;"
86   NL "    } else if (gid0 == get_global_size(0) - 1 && gid1 == get_global_size(1) - 1 && gid2 == get_global_size(2) - 1) {"
87   NL "      return 7;"
88   NL "    }"
89   NL "    return -1;"
90   NL "}"
91 
92   NL "void getLocalSize(__global DataContainerAttrib *results) {"
93   NL "  for (unsigned short i = 0; i < MAX_DIMS; i++) {"
94   NL "    results->get_local_size[i] = get_local_size(i);"
95   NL "  }"
96   NL "}"
97 
98   NL "#ifdef TESTBASIC"
99   // values set by this function will be checked on the host side
100   NL "void testBasicHost(__global DataContainerAttrib *results) {"
101   NL "    for (unsigned short i = 0; i < MAX_DIMS; i++) {"
102   NL "      results->get_global_size[i] = get_global_size(i);"
103   NL "      results->get_global_offset[i] = get_global_offset(i);"
104   NL "      results->get_enqueued_local_size[i] = get_enqueued_local_size(i);"
105   NL "      results->get_global_id[i] = get_global_id(i);"
106   NL "      results->get_local_id[i] = get_local_id(i);"
107   NL "      results->get_group_id[i] = get_group_id(i);"
108   NL "      results->get_num_groups[i] = get_num_groups(i);"
109   NL "    }"
110   NL "    results->get_work_dim = get_work_dim();"
111   NL "}"
112   // values set by this function are checked on the kernel side
113   NL "void testBasicKernel(__global unsigned int *errorCounterBuffer, __local DataContainerAttrib *resultsForThread0) {"
114   NL "  uint lid0 = get_local_id(0);"
115   NL "  uint lid1 = get_local_id(1);"
116   NL "  uint lid2 = get_local_id(2);"
117   NL "  if (lid0 == 0 && lid1 == 0 && lid2 == 0) {"
118   NL "    for (unsigned short i = 0; i < MAX_DIMS; i++) {"
119   NL "      resultsForThread0->get_global_size[i] = get_global_size(i);"
120   NL "      resultsForThread0->get_global_offset[i] = get_global_offset(i);"
121   NL "      resultsForThread0->get_enqueued_local_size[i] = get_enqueued_local_size(i);"
122   NL "      resultsForThread0->get_group_id[i] = get_group_id(i);"
123   NL "      resultsForThread0->get_num_groups[i] = get_num_groups(i);"
124   NL "    }"
125   NL "    resultsForThread0->get_work_dim = get_work_dim();"
126   NL "  }"
127   NL "    barrier(CLK_LOCAL_MEM_FENCE);"
128   // verifies built in functions on the kernel side
129   NL "  if (lid0 != 0 || lid1 != 0 || lid2 != 0) {"
130   NL "    for (unsigned short i = 0; i < MAX_DIMS; i++) {"
131   NL "      if (resultsForThread0->get_global_size[i] != get_global_size(i)) {"
132   NL "        atomic_inc(&errorCounterBuffer[ERR_GLOBAL_SIZE]);"
133   NL "      }"
134   NL "      if (resultsForThread0->get_global_offset[i] != get_global_offset(i)) {"
135   NL "        atomic_inc(&errorCounterBuffer[ERR_GLOBAL_WORK_OFFSET]);"
136   NL "      }"
137   NL "      if (resultsForThread0->get_enqueued_local_size[i] != get_enqueued_local_size(i)) {"
138   NL "        atomic_inc(&errorCounterBuffer[ERR_ENQUEUED_LOCAL_SIZE]);"
139   NL "      }"
140   NL "      if (resultsForThread0->get_group_id[i] != get_group_id(i)) {"
141   NL "        atomic_inc(&errorCounterBuffer[ERR_GROUP_ID]);"
142   NL "      }"
143   NL "      if (resultsForThread0->get_num_groups[i] != get_num_groups(i)) {"
144   NL "        atomic_inc(&errorCounterBuffer[ERR_NUM_GROUPS]);"
145   NL "      }"
146   NL "    }"
147   NL "    if (resultsForThread0->get_work_dim != get_work_dim()) {"
148   NL "      atomic_inc(&errorCounterBuffer[ERR_WORK_DIM]);"
149   NL "    }"
150   NL "  }"
151   NL "}"
152   NL "#endif"
153 
154   NL "#ifdef TESTBARRIERS"
155   NL "void testBarriers(__global unsigned int *errorCounterBuffer, __local unsigned int *testLocalBuffer, __global unsigned int *testGlobalBuffer) {"
156   NL "    uint gid0 = get_global_id(0);"
157   NL "    uint gid1 = get_global_id(1);"
158   NL "    uint gid2 = get_global_id(2);"
159   NL "    uint lid0 = get_local_id(0);"
160   NL "    uint lid1 = get_local_id(1);"
161   NL "    uint lid2 = get_local_id(2);"
162   NL
163   NL "    uint globalIndex = getGlobalIndex(gid2-get_global_offset(2), gid1-get_global_offset(1), gid0-get_global_offset(0));"
164   NL "    uint localIndex = lid2*get_local_size(0)*get_local_size(1) + lid1*get_local_size(0) + lid0;"
165   NL "    testLocalBuffer[localIndex] = 0;"
166   NL "    testGlobalBuffer[globalIndex] = 0;"
167   NL "    uint maxLocalIndex = get_local_size(0)*get_local_size(1)*get_local_size(2)-1;"
168   NL "    uint nextLocalIndex = (localIndex>=maxLocalIndex)?0:(localIndex+1);"
169   NL "    uint next_lid0 = (lid0+1>=get_local_size(0))?0:lid0+1;"
170   NL "    uint next_lid1 = (lid1+1>=get_local_size(1))?0:lid1+1;"
171   NL "    uint next_lid2 = (lid2+1>=get_local_size(2))?0:lid2+1;"
172   NL "    uint nextGlobalIndexInLocalWorkGroup = getGlobalIndex (get_group_id(2)*get_enqueued_local_size(2)+next_lid2, get_group_id(1)*get_enqueued_local_size(1)+next_lid1, get_group_id(0)*get_enqueued_local_size(0)+next_lid0);"
173   // testing local barriers
174   NL "    testLocalBuffer[localIndex] = localIndex;"
175   NL "    barrier(CLK_LOCAL_MEM_FENCE);"
176   NL "    uint temp = testLocalBuffer[nextLocalIndex];"
177   NL "    if (temp != nextLocalIndex) {"
178   NL "      atomic_inc(&errorCounterBuffer[ERR_LOCAL_BARRIER]);"
179   NL "    }"
180   // testing global barriers
181   NL "    testGlobalBuffer[globalIndex] = globalIndex;"
182   NL "    barrier(CLK_GLOBAL_MEM_FENCE);"
183   NL "    uint temp2 = testGlobalBuffer[nextGlobalIndexInLocalWorkGroup];"
184   NL "    if (temp2 != nextGlobalIndexInLocalWorkGroup) {"
185   NL "      atomic_inc(&errorCounterBuffer[ERR_GLOBAL_BARRIER]);"
186   NL "    }"
187   NL "}"
188   NL "#endif"
189 
190   NL "#ifdef TESTATOMICS"
191   NL "void testAtomics(__global unsigned int *globalAtomicTestVariable, __local unsigned int *localAtomicTestVariable) {"
192   NL "    uint gid0 = get_global_id(0);"
193   NL "    uint gid1 = get_global_id(1);"
194   NL "    uint gid2 = get_global_id(2);"
195   NL
196   NL "    uint globalIndex = getGlobalIndex(gid2-get_global_offset(2), gid1-get_global_offset(1), gid0-get_global_offset(0));"
197   // testing atomic function on local memory
198   NL "    atomic_inc(localAtomicTestVariable);"
199   NL "    barrier(CLK_LOCAL_MEM_FENCE);"
200   // testing atomic function on global memory
201   NL "    atomic_inc(globalAtomicTestVariable);"
202   NL "}"
203   NL "#endif"
204 
205   NL "#ifdef RWGSX"
206   NL "#ifdef RWGSY"
207   NL "#ifdef RWGSZ"
208   NL "__attribute__((reqd_work_group_size(RWGSX, RWGSY, RWGSZ)))"
209   NL "#endif"
210   NL "#endif"
211   NL "#endif"
212   NL "__kernel void testKernel(__global DataContainerAttrib *results, __local unsigned int *testLocalBuffer,"
213   NL "      __global unsigned int *testGlobalBuffer, __global unsigned int *globalAtomicTestVariable, __global unsigned int *errorCounterBuffer) {"
214   NL "    uint gid0 = get_global_id(0);"
215   NL "    uint gid1 = get_global_id(1);"
216   NL "    uint gid2 = get_global_id(2);"
217   NL
218   NL "    uint globalIndex = getGlobalIndex(gid2-get_global_offset(2), gid1-get_global_offset(1), gid0-get_global_offset(0));"
219   NL "    int regionIndex = getRegionIndex();"
220   NL "    if (regionIndex >= 0) {"
221   NL "      getLocalSize(&results[regionIndex]);"
222   NL "    }"
223   NL "#ifdef TESTBASIC"
224   NL "    if (regionIndex >= 0) {"
225   NL "      testBasicHost(&results[regionIndex]);"
226   NL "    }"
227   NL "    __local DataContainerAttrib resultsForThread0;"
228   NL "    testBasicKernel(errorCounterBuffer, &resultsForThread0);"
229   NL "#endif"
230   NL "#ifdef TESTBARRIERS"
231   NL "    testBarriers(errorCounterBuffer, testLocalBuffer, testGlobalBuffer);"
232   NL "#endif"
233   NL "#ifdef TESTATOMICS"
234   NL "    __local unsigned int localAtomicTestVariable;"
235   NL "    localAtomicTestVariable = 0;"
236   NL "    barrier(CLK_LOCAL_MEM_FENCE);"
237   NL "    testAtomics(globalAtomicTestVariable, &localAtomicTestVariable);"
238   NL "    barrier(CLK_LOCAL_MEM_FENCE);"
239   NL "    if (localAtomicTestVariable != get_local_size(0) * get_local_size(1) * get_local_size(2)) {"
240   NL "      atomic_inc(&errorCounterBuffer[ERR_LOCAL_ATOMIC]);"
241   NL "    }"
242   NL "#endif"
243   NL "}"
244   NL ;
245 
TestNonUniformWorkGroup(const cl_device_id & device,const cl_context & context,const cl_command_queue & queue,const cl_uint dims,size_t * globalSize,const size_t * localSize,const size_t * buffersSize,const size_t * globalWorkOffset,const size_t * reqdWorkGroupSize)246 TestNonUniformWorkGroup::TestNonUniformWorkGroup(
247     const cl_device_id &device, const cl_context &context,
248     const cl_command_queue &queue, const cl_uint dims, size_t *globalSize,
249     const size_t *localSize, const size_t *buffersSize,
250     const size_t *globalWorkOffset, const size_t *reqdWorkGroupSize)
251     : _device(device), _context(context), _queue(queue), _dims(dims)
252 {
253 
254     if (globalSize == NULL || dims < 1 || dims > 3)
255     {
256         // throw std::invalid_argument("globalSize is NULL value.");
257         // This is method of informing that parameters are wrong.
258         // It would be checked by prepareDevice() function.
259         // This is used because of lack of exception support.
260         _globalSize[0] = 0;
261         return;
262     }
263 
264     // For OpenCL-3.0 support for non-uniform workgroups is optional, it's still
265     // useful to run these tests since we can verify the behavior of the
266     // get_enqueued_local_size() builtin for uniform workgroups, so we round up
267     // the global size to insure uniform workgroups on those 3.0 devices.
268     // We only need to do this when localSize is non-null, otherwise the driver
269     // will select a value for localSize which will be uniform on devices that
270     // don't support non-uniform work-groups.
271     if (nullptr != localSize && get_device_cl_version(device) >= Version(3, 0))
272     {
273         // Query for the non-uniform work-group support.
274         cl_bool are_non_uniform_sub_groups_supported{ CL_FALSE };
275         auto error =
276             clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT,
277                             sizeof(are_non_uniform_sub_groups_supported),
278                             &are_non_uniform_sub_groups_supported, nullptr);
279         if (error)
280         {
281             print_error(error,
282                         "clGetDeviceInfo failed for "
283                         "CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT");
284             // This signals an error to the caller (see above).
285             _globalSize[0] = 0;
286             return;
287         }
288 
289         // If non-uniform work-groups are not supported round up the global
290         // sizes so workgroups are uniform and we have at least one.
291         if (CL_FALSE == are_non_uniform_sub_groups_supported)
292         {
293             log_info(
294                 "WARNING: Non-uniform work-groups are not supported on this "
295                 "device.\n Running test with uniform work-groups.\n");
296             for (unsigned dim = 0; dim < dims; ++dim)
297             {
298                 auto global_size_before = globalSize[dim];
299                 auto global_size_rounded = global_size_before
300                     + (localSize[dim] - global_size_before % localSize[dim]);
301                 globalSize[dim] = global_size_rounded;
302                 log_info("Rounding globalSize[%d] = %d -> %d\n", dim,
303                          global_size_before, global_size_rounded);
304             }
305         }
306     }
307 
308     cl_uint i;
309     _globalWorkOffset_IsNull = true;
310     _localSize_IsNull = true;
311 
312     setGlobalWorkgroupSize(globalSize);
313     setLocalWorkgroupSize(globalSize, localSize);
314     for (i = _dims; i < MAX_DIMS; i++)
315     {
316         _globalSize[i] = 1;
317     }
318 
319     for (i = 0; i < MAX_DIMS; i++)
320     {
321         _globalWorkOffset[i] = 0;
322     }
323 
324     if (globalWorkOffset)
325     {
326         _globalWorkOffset_IsNull = false;
327         for (i = 0; i < _dims; i++)
328         {
329             _globalWorkOffset[i] = globalWorkOffset[i];
330         }
331     }
332 
333     for (i = 0; i < MAX_DIMS; i++)
334     {
335         _enqueuedLocalSize[i] = 1;
336     }
337 
338     if (localSize)
339     {
340         _localSize_IsNull = false;
341         for (i = 0; i < _dims; i++)
342         {
343             _enqueuedLocalSize[i] = _localSize[i];
344         }
345     }
346 
347     if (reqdWorkGroupSize)
348     {
349         for (i = 0; i < _dims; i++)
350         {
351             _reqdWorkGroupSize[i] = reqdWorkGroupSize[i];
352         }
353         for (i = _dims; i < MAX_DIMS; i++)
354         {
355             _reqdWorkGroupSize[i] = 1;
356         }
357     }
358     else
359     {
360         _reqdWorkGroupSize[0] = 0;
361         _reqdWorkGroupSize[1] = 0;
362         _reqdWorkGroupSize[2] = 0;
363     }
364 
365     _testRange = Range::ALL;
366 
367     _numOfGlobalWorkItems = _globalSize[0] * _globalSize[1] * _globalSize[2];
368 
369     DataContainerAttrib temp = { { 0, 0, 0 } };
370 
371     // array with results from each region
372     _resultsRegionArray.resize(NUMBER_OF_REGIONS, temp);
373     _referenceRegionArray.resize(NUMBER_OF_REGIONS, temp);
374 }
375 
~TestNonUniformWorkGroup()376 TestNonUniformWorkGroup::~TestNonUniformWorkGroup () {
377   if (_err.checkError()) {
378     _err.showStats();
379   }
380 }
381 
setLocalWorkgroupSize(const size_t * globalSize,const size_t * localSize)382 void TestNonUniformWorkGroup::setLocalWorkgroupSize (const size_t *globalSize, const size_t *localSize)
383 {
384    cl_uint i;
385    // Enforce localSize should not exceed globalSize
386    if (localSize) {
387        for (i = 0; i < _dims; i++) {
388            if ((globalSize[i] < localSize[i])) {
389                _localSize[i] = globalSize[i];
390            }else{
391                _localSize[i] = localSize[i];
392            }
393       }
394    }
395 }
396 
setGlobalWorkgroupSize(const size_t * globalSize)397 void TestNonUniformWorkGroup::setGlobalWorkgroupSize (const size_t *globalSize)
398 {
399    cl_uint i;
400    for (i = 0; i < _dims; i++) {
401        _globalSize[i] = globalSize[i];
402    }
403 }
404 
verifyData(DataContainerAttrib * reference,DataContainerAttrib * results,short regionNumber)405 void TestNonUniformWorkGroup::verifyData (DataContainerAttrib * reference, DataContainerAttrib * results, short regionNumber) {
406 
407   std::ostringstream tmp;
408   std::string errorLocation;
409 
410   if (_testRange & Range::BASIC) {
411     for (unsigned short i = 0; i < MAX_DIMS; i++) {
412       tmp.str("");
413       tmp.clear();
414       tmp << "region number: " << regionNumber << " for dim: " << i;
415       errorLocation = tmp.str();
416 
417       if (results->get_global_size[i] != reference->get_global_size[i]) {
418         _err.show(Error::ERR_GLOBAL_SIZE, errorLocation, results->get_global_size[i], reference->get_global_size[i]);
419       }
420 
421       if (results->get_global_offset[i] != reference->get_global_offset[i]) {
422         _err.show(Error::ERR_GLOBAL_WORK_OFFSET, errorLocation, results->get_global_offset[i], reference->get_global_offset[i]);
423       }
424 
425       if (results->get_local_size[i] != reference->get_local_size[i] || results->get_local_size[i] > _maxWorkItemSizes[i]) {
426         _err.show(Error::ERR_LOCAL_SIZE, errorLocation, results->get_local_size[i], reference->get_local_size[i]);
427       }
428 
429       if (results->get_enqueued_local_size[i] != reference->get_enqueued_local_size[i] || results->get_enqueued_local_size[i] > _maxWorkItemSizes[i]) {
430         _err.show(Error::ERR_ENQUEUED_LOCAL_SIZE, errorLocation, results->get_enqueued_local_size[i], reference->get_enqueued_local_size[i]);
431       }
432 
433       if (results->get_num_groups[i] != reference->get_num_groups[i]) {
434         _err.show(Error::ERR_NUM_GROUPS, errorLocation, results->get_num_groups[i], reference->get_num_groups[i]);
435       }
436     }
437   }
438 
439   tmp.str("");
440   tmp.clear();
441   tmp << "region number: " << regionNumber;
442   errorLocation = tmp.str();
443   if (_testRange & Range::BASIC) {
444     if (results->get_work_dim != reference->get_work_dim) {
445       _err.show(Error::ERR_WORK_DIM, errorLocation, results->get_work_dim, reference->get_work_dim);
446     }
447   }
448 }
449 
calculateExpectedValues()450 void TestNonUniformWorkGroup::calculateExpectedValues () {
451   size_t nonRemainderGlobalSize[MAX_DIMS];
452   size_t numberOfPossibleRegions[MAX_DIMS];
453 
454   nonRemainderGlobalSize[0] = _globalSize[0] - (_globalSize[0] % _enqueuedLocalSize[0]);
455   nonRemainderGlobalSize[1] = _globalSize[1] - (_globalSize[1] % _enqueuedLocalSize[1]);
456   nonRemainderGlobalSize[2] = _globalSize[2] - (_globalSize[2] % _enqueuedLocalSize[2]);
457 
458   numberOfPossibleRegions[0] = (_globalSize[0]>1)?2:1;
459   numberOfPossibleRegions[1] = (_globalSize[1]>1)?2:1;
460   numberOfPossibleRegions[2] = (_globalSize[2]>1)?2:1;
461 
462   for (cl_ushort i = 0; i < NUMBER_OF_REGIONS; ++i) {
463 
464     if (i & 0x01 && numberOfPossibleRegions[0] == 1) {
465       continue;
466     }
467 
468     if (i & 0x02 && numberOfPossibleRegions[1] == 1) {
469       continue;
470     }
471 
472     if (i & 0x04 && numberOfPossibleRegions[2] == 1) {
473       continue;
474     }
475 
476     for (cl_ushort dim = 0; dim < MAX_DIMS; ++dim) {
477       _referenceRegionArray[i].get_global_size[dim] = static_cast<unsigned long>(_globalSize[dim]);
478       _referenceRegionArray[i].get_global_offset[dim] = static_cast<unsigned long>(_globalWorkOffset[dim]);
479       _referenceRegionArray[i].get_enqueued_local_size[dim] = static_cast<unsigned long>(_enqueuedLocalSize[dim]);
480       _referenceRegionArray[i].get_local_size[dim] = static_cast<unsigned long>(_enqueuedLocalSize[dim]);
481       _referenceRegionArray[i].get_num_groups[dim] = static_cast<unsigned long>(ceil(static_cast<float>(_globalSize[dim]) / _enqueuedLocalSize[dim]));
482     }
483     _referenceRegionArray[i].get_work_dim = _dims;
484 
485     if (i & 0x01) {
486       _referenceRegionArray[i].get_local_size[0] = static_cast<unsigned long>((_globalSize[0] - 1) % _enqueuedLocalSize[0] + 1);
487     }
488 
489     if (i & 0x02) {
490       _referenceRegionArray[i].get_local_size[1] = static_cast<unsigned long>((_globalSize[1] - 1) % _enqueuedLocalSize[1] + 1);
491     }
492 
493     if (i & 0x04) {
494       _referenceRegionArray[i].get_local_size[2] = static_cast<unsigned long>((_globalSize[2] - 1) % _enqueuedLocalSize[2] + 1);
495     }
496   }
497 }
498 
getMaxLocalWorkgroupSize(const cl_device_id & device)499 size_t TestNonUniformWorkGroup::getMaxLocalWorkgroupSize (const cl_device_id &device) {
500   int err;
501 
502   if (TestNonUniformWorkGroup::_maxLocalWorkgroupSize == 0) {
503     err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
504       sizeof(TestNonUniformWorkGroup::_maxLocalWorkgroupSize), &TestNonUniformWorkGroup::_maxLocalWorkgroupSize, NULL);
505   }
506 
507   return TestNonUniformWorkGroup::_maxLocalWorkgroupSize;
508 }
509 
enableStrictMode(bool state)510 void TestNonUniformWorkGroup::enableStrictMode(bool state) {
511   TestNonUniformWorkGroup::_strictMode = state;
512 }
513 
prepareDevice()514 int TestNonUniformWorkGroup::prepareDevice () {
515   int err;
516   cl_uint device_max_dimensions;
517   cl_uint i;
518 
519   if (_globalSize[0] == 0)
520   {
521     log_error("Some arguments passed into constructor were wrong.\n");
522     return -1;
523   }
524 
525   err = clGetDeviceInfo(_device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
526     sizeof(device_max_dimensions), &device_max_dimensions, NULL);
527   test_error(err, "clGetDeviceInfo failed");
528 
529   err = clGetDeviceInfo(_device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
530     sizeof(_maxWorkItemSizes), _maxWorkItemSizes, NULL);
531 
532   test_error(err, "clGetDeviceInfo failed");
533 
534   // Trim the local size to the limitations of what the device supports in each dimension.
535   for (i = 0; i < _dims; i++) {
536     if(_enqueuedLocalSize[i] > _maxWorkItemSizes[i]) {
537       _enqueuedLocalSize[i] = _maxWorkItemSizes[i];
538     }
539   }
540 
541   if(_localSize_IsNull == false)
542     calculateExpectedValues();
543 
544   std::string buildOptions{};
545   if(_reqdWorkGroupSize[0] != 0 && _reqdWorkGroupSize[1] != 0 && _reqdWorkGroupSize[2] != 0) {
546     std::ostringstream tmp(" ");
547     tmp << " -D RWGSX=" << _reqdWorkGroupSize[0]
548       << " -D RWGSY=" << _reqdWorkGroupSize[1]
549       << " -D RWGSZ=" << _reqdWorkGroupSize[2] << " ";
550       buildOptions += tmp.str();
551   }
552 
553   if (_testRange & Range::BASIC)
554     buildOptions += " -D TESTBASIC";
555   if (_testRange & Range::ATOMICS)
556     buildOptions += " -D TESTATOMICS";
557   if (_testRange & Range::BARRIERS)
558     buildOptions += " -D TESTBARRIERS";
559 
560   err = create_single_kernel_helper_with_build_options (_context, &_program, &_testKernel, 1,
561     &KERNEL_FUNCTION, "testKernel", buildOptions.c_str());
562   if (err)
563   {
564     log_error("Error %d in line: %d of file %s\n", err, __LINE__, __FILE__);
565     return -1;
566   }
567 
568   return 0;
569 }
570 
verifyResults()571 int TestNonUniformWorkGroup::verifyResults () {
572   if (_localSize_IsNull) {
573     // for global work groups where local work group size is not defined (set to NULL in clEnqueueNDRangeKernel)
574     // we need to check what optimal size was chosen by device
575     // we assumed that local size value for work item 0 is right for the rest work items
576     _enqueuedLocalSize[0] = static_cast<size_t>(_resultsRegionArray[0].get_local_size[0]);
577     _enqueuedLocalSize[1] = static_cast<size_t>(_resultsRegionArray[0].get_local_size[1]);
578     _enqueuedLocalSize[2] = static_cast<size_t>(_resultsRegionArray[0].get_local_size[2]);
579     calculateExpectedValues();
580 
581     // strict mode verification
582     if(_strictMode) {
583       size_t localWorkGroupSize = _enqueuedLocalSize[0]*_enqueuedLocalSize[1]*_enqueuedLocalSize[2];
584       if (localWorkGroupSize != TestNonUniformWorkGroup::getMaxLocalWorkgroupSize(_device))
585           _err.show(Error::ERR_STRICT_MODE, "",localWorkGroupSize, TestNonUniformWorkGroup::getMaxLocalWorkgroupSize(_device));
586     }
587 
588     log_info ("Local work group size calculated by driver: %s\n", showArray(_enqueuedLocalSize, _dims).c_str());
589  }
590 
591   for (cl_ushort i = 0; i < NUMBER_OF_REGIONS; ++i) {
592     verifyData(&_referenceRegionArray[i], &_resultsRegionArray[i], i);
593   }
594 
595   if (_testRange & Range::ATOMICS) {
596     if (_globalAtomicTestValue != _numOfGlobalWorkItems) {
597       _err.show(Error::ERR_GLOBAL_ATOMIC);
598     }
599   }
600 
601   if (_err.checkError())
602     return -1;
603 
604   return 0;
605 }
606 
showArray(const size_t * arr,cl_uint dims)607 std::string showArray (const size_t *arr, cl_uint dims) {
608   std::ostringstream tmpStringStream ("");
609 
610   tmpStringStream << "{";
611   for (cl_uint i=0; i < dims; i++) {
612     tmpStringStream << arr[i];
613     if (i+1 < dims)
614       tmpStringStream << ", ";
615   }
616   tmpStringStream << "}";
617 
618   return tmpStringStream.str();
619 }
620 
showTestInfo()621 void TestNonUniformWorkGroup::showTestInfo () {
622   std::string tmpString;
623   log_info ("T E S T  P A R A M E T E R S :\n");
624   log_info ("\tNumber of dimensions:\t%d\n", _dims);
625 
626   tmpString = showArray(_globalSize, _dims);
627 
628   log_info("\tGlobal work group size:\t%s\n", tmpString.c_str());
629 
630   if (!_localSize_IsNull) {
631     tmpString = showArray(_enqueuedLocalSize, _dims);
632   } else {
633     tmpString = "NULL";
634   }
635   log_info("\tLocal work group size:\t%s\n", tmpString.c_str());
636 
637   if (!_globalWorkOffset_IsNull) {
638     tmpString = showArray(_globalWorkOffset, _dims);
639   } else {
640     tmpString = "NULL";
641   }
642   log_info("\tGlobal work group offset:\t%s\n", tmpString.c_str());
643 
644   if (_reqdWorkGroupSize[0] != 0 && _reqdWorkGroupSize[1] != 0 && _reqdWorkGroupSize[2] != 0) {
645     tmpString = showArray(_reqdWorkGroupSize, _dims);
646   } else {
647     tmpString = "attribute disabled";
648   }
649   log_info ("\treqd_work_group_size attribute:\t%s\n", tmpString.c_str());
650 
651   tmpString = "";
652   if(_testRange & Range::BASIC)
653      tmpString += "basic";
654   if(_testRange & Range::ATOMICS) {
655     if(tmpString != "") tmpString += ", ";
656     tmpString += "atomics";
657   }
658   if(_testRange & Range::BARRIERS) {
659     if(tmpString != "") tmpString += ", ";
660     tmpString += "barriers";
661   }
662   log_info ("\tTest range:\t%s\n", tmpString.c_str());
663   if(_strictMode) {
664     log_info ("\tStrict mode:\tON\n");
665     if (!_localSize_IsNull) {
666       log_info ("\tATTENTION: strict mode applies only NULL local work group size\n");
667     } else {
668       log_info ("\t\tExpected value of local work group size is %ld.\n",
669         TestNonUniformWorkGroup::getMaxLocalWorkgroupSize(_device));
670     }
671 
672   }
673 }
674 
adjustLocalArraySize(size_t localArraySize)675 size_t TestNonUniformWorkGroup::adjustLocalArraySize (size_t localArraySize) {
676   // In case if localArraySize is too big, sometimes we can not run kernel because of lack
677   // of resources due to kernel itself requires some local memory to run
678   int err;
679 
680   cl_ulong kernelLocalMemSize = 0;
681   err = clGetKernelWorkGroupInfo(_testKernel, _device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernelLocalMemSize), &kernelLocalMemSize, NULL);
682   test_error(err, "clGetKernelWorkGroupInfo failed");
683 
684   cl_ulong deviceLocalMemSize = 0;
685   err = clGetDeviceInfo(_device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(deviceLocalMemSize), &deviceLocalMemSize, NULL);
686   test_error(err, "clGetDeviceInfo failed");
687 
688   if (kernelLocalMemSize + localArraySize > deviceLocalMemSize) {
689     size_t adjustedLocalArraySize = deviceLocalMemSize - kernelLocalMemSize;
690     log_info("localArraySize was adjusted from %lu to %lu\n", localArraySize, adjustedLocalArraySize);
691     localArraySize = adjustedLocalArraySize;
692   }
693 
694   return localArraySize;
695 }
696 
adjustGlobalBufferSize(size_t globalBufferSize)697 size_t TestNonUniformWorkGroup::adjustGlobalBufferSize(size_t globalBufferSize) {
698   // In case if global buffer size is too big, sometimes we can not run kernel because of lack
699   // of resources due to kernel itself requires some global memory to run
700   int err;
701 
702   cl_ulong deviceMaxAllocObjSize = 0;
703   err = clGetDeviceInfo(_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(deviceMaxAllocObjSize), &deviceMaxAllocObjSize, NULL);
704   test_error(err, "clGetDeviceInfo failed");
705 
706   size_t adjustedGlobalBufferSize = globalBufferSize;
707   if (deviceMaxAllocObjSize < globalBufferSize) {
708     adjustedGlobalBufferSize = deviceMaxAllocObjSize;
709     log_info("globalBufferSize was adjusted from %lu to %lu\n", globalBufferSize, adjustedGlobalBufferSize);
710   }
711 
712   return adjustedGlobalBufferSize;
713 }
714 
runKernel()715 int TestNonUniformWorkGroup::runKernel () {
716   int err;
717 
718   // TEST INFO
719   showTestInfo();
720 
721   size_t localArraySize = (_localSize_IsNull)?TestNonUniformWorkGroup::getMaxLocalWorkgroupSize(_device):(_enqueuedLocalSize[0]*_enqueuedLocalSize[1]*_enqueuedLocalSize[2]);
722   clMemWrapper resultsRegionArray = clCreateBuffer(_context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _resultsRegionArray.size() * sizeof(DataContainerAttrib), &_resultsRegionArray.front(), &err);
723   test_error(err, "clCreateBuffer failed");
724 
725   size_t *localSizePtr = (_localSize_IsNull)?NULL:_enqueuedLocalSize;
726   size_t *globalWorkOffsetPtr = (_globalWorkOffset_IsNull)?NULL:_globalWorkOffset;
727 
728   err = clSetKernelArg(_testKernel, 0, sizeof(resultsRegionArray), &resultsRegionArray);
729   test_error(err, "clSetKernelArg failed");
730 
731   //creating local buffer
732   localArraySize = adjustLocalArraySize(localArraySize*sizeof(unsigned int));
733   err = clSetKernelArg(_testKernel, 1, localArraySize, NULL);
734   test_error(err, "clSetKernelArg failed");
735 
736   size_t globalBufferSize = adjustGlobalBufferSize(_numOfGlobalWorkItems*sizeof(cl_uint));
737   clMemWrapper testGlobalArray = clCreateBuffer(_context, CL_MEM_READ_WRITE, globalBufferSize, NULL, &err);
738   test_error(err, "clCreateBuffer failed");
739 
740   err = clSetKernelArg(_testKernel, 2, sizeof(testGlobalArray), &testGlobalArray);
741   test_error(err, "clSetKernelArg failed");
742 
743   _globalAtomicTestValue = 0;
744   clMemWrapper globalAtomicTestVariable = clCreateBuffer(_context, (CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR), sizeof(_globalAtomicTestValue), &_globalAtomicTestValue, &err);
745   test_error(err, "clCreateBuffer failed");
746 
747   err = clSetKernelArg(_testKernel, 3, sizeof(globalAtomicTestVariable), &globalAtomicTestVariable);
748   test_error(err, "clSetKernelArg failed");
749 
750   clMemWrapper errorArray = clCreateBuffer(_context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _err.errorArrayCounterSize(), _err.errorArrayCounter(), &err);
751   test_error(err, "clCreateBuffer failed");
752 
753   err = clSetKernelArg(_testKernel, 4, sizeof(errorArray), &errorArray);
754   test_error(err, "clSetKernelArg failed");
755 
756   err = clEnqueueNDRangeKernel(_queue, _testKernel, _dims, globalWorkOffsetPtr, _globalSize,
757     localSizePtr, 0, NULL, NULL);
758   test_error(err, "clEnqueueNDRangeKernel failed");
759 
760 
761   err = clFinish(_queue);
762   test_error(err, "clFinish failed");
763 
764   err = clEnqueueReadBuffer(_queue, globalAtomicTestVariable, CL_TRUE, 0, sizeof(unsigned int), &_globalAtomicTestValue, 0, NULL, NULL);
765   test_error(err, "clEnqueueReadBuffer failed");
766 
767   if (_err.checkError()) {
768     return -1;
769   }
770 
771   // synchronization of main buffer
772   err = clEnqueueReadBuffer(_queue, resultsRegionArray, CL_TRUE, 0, _resultsRegionArray.size() * sizeof(DataContainerAttrib), &_resultsRegionArray.front(), 0, NULL, NULL);
773   test_error(err, "clEnqueueReadBuffer failed");
774 
775   err = clEnqueueReadBuffer(_queue, errorArray, CL_TRUE, 0, _err.errorArrayCounterSize(), _err.errorArrayCounter(), 0, NULL, NULL);
776   test_error(err, "clEnqueueReadBuffer failed");
777   // Synchronization of errors occurred in kernel into general error stats
778   _err.synchronizeStatsMap();
779 
780   return 0;
781 }
782 
runTestNonUniformWorkGroup(const cl_uint dims,size_t * globalSize,const size_t * localSize,int range)783 void SubTestExecutor::runTestNonUniformWorkGroup(const cl_uint dims,
784                                                  size_t *globalSize,
785                                                  const size_t *localSize,
786                                                  int range)
787 {
788     runTestNonUniformWorkGroup(dims, globalSize, localSize, NULL, NULL, range);
789 }
790 
runTestNonUniformWorkGroup(const cl_uint dims,size_t * globalSize,const size_t * localSize,const size_t * globalWorkOffset,const size_t * reqdWorkGroupSize,int range)791 void SubTestExecutor::runTestNonUniformWorkGroup(
792     const cl_uint dims, size_t *globalSize, const size_t *localSize,
793     const size_t *globalWorkOffset, const size_t *reqdWorkGroupSize, int range)
794 {
795 
796 
797     int err;
798     ++_overallCounter;
799     TestNonUniformWorkGroup test(_device, _context, _queue, dims, globalSize,
800                                  localSize, NULL, globalWorkOffset,
801                                  reqdWorkGroupSize);
802 
803     test.setTestRange(range);
804     err = test.prepareDevice();
805     if (err)
806     {
807         log_error("Error: prepare device\n");
808         ++_failCounter;
809         return;
810     }
811 
812     err = test.runKernel();
813     if (err)
814     {
815         log_error("Error: run kernel\n");
816         ++_failCounter;
817         return;
818     }
819 
820     err = test.verifyResults();
821     if (err)
822     {
823         log_error("Error: verify results\n");
824         ++_failCounter;
825         return;
826     }
827 }
828 
calculateWorkGroupSize(size_t & maxWgSize,int testRange)829 int SubTestExecutor::calculateWorkGroupSize(size_t &maxWgSize, int testRange) {
830   int err;
831 
832   clProgramWrapper program;
833   clKernelWrapper testKernel;
834   std::string buildOptions{};
835 
836   if (testRange & Range::BASIC)
837     buildOptions += " -D TESTBASIC";
838   if (testRange & Range::ATOMICS)
839     buildOptions += " -D TESTATOMICS";
840   if (testRange & Range::BARRIERS)
841     buildOptions += " -D TESTBARRIERS";
842 
843   err = create_single_kernel_helper_with_build_options (_context, &program, &testKernel, 1,
844     &KERNEL_FUNCTION, "testKernel", buildOptions.c_str());
845   if (err)
846   {
847     log_error("Error %d in line: %d of file %s\n", err, __LINE__, __FILE__);
848     return err;
849   }
850 
851   err = clGetKernelWorkGroupInfo (testKernel, _device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxWgSize), &maxWgSize, NULL);
852   test_error(err, "clGetKernelWorkGroupInfo failed");
853 
854   TestNonUniformWorkGroup::setMaxLocalWorkgroupSize(maxWgSize);
855 
856   return 0;
857 }
858 
status()859 int SubTestExecutor::status() {
860 
861   if (_failCounter>0) {
862     log_error ("%d subtest(s) (of %d) failed\n", _failCounter, _overallCounter);
863     return -1;
864   } else {
865     log_info ("All %d subtest(s) passed\n", _overallCounter);
866     return 0;
867   }
868 }
869 
870