//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef _COMMON_H_
#define _COMMON_H_

#include "harness/testHarness.h"
#include "harness/typeWrappers.h"
#include "harness/ThreadPool.h"

#include "host_atomics.h"

#include <vector>
#include <sstream>

#define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
#define MAX_HOST_THREADS GetThreadCount()

#define EXECUTE_TEST(error, test)\
  error |= test;\
  if(error && !gContinueOnError)\
  return error;
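// Minimal usage sketch of EXECUTE_TEST (illustrative only, not part of the
// suite): the macro accumulates the error code and returns from the enclosing
// function early unless gContinueOnError is set, e.g.
//
//   int error = 0;
//   EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
//   EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
//   return error;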

enum TExplicitAtomicType
{
    TYPE_ATOMIC_INT,
    TYPE_ATOMIC_UINT,
    TYPE_ATOMIC_LONG,
    TYPE_ATOMIC_ULONG,
    TYPE_ATOMIC_FLOAT,
    TYPE_ATOMIC_DOUBLE,
    TYPE_ATOMIC_INTPTR_T,
    TYPE_ATOMIC_UINTPTR_T,
    TYPE_ATOMIC_SIZE_T,
    TYPE_ATOMIC_PTRDIFF_T,
    TYPE_ATOMIC_FLAG
};

enum TExplicitMemoryScopeType
{
    MEMORY_SCOPE_EMPTY,
    MEMORY_SCOPE_WORK_GROUP,
    MEMORY_SCOPE_DEVICE,
    MEMORY_SCOPE_ALL_DEVICES, // Alias for MEMORY_SCOPE_ALL_SVM_DEVICES
    MEMORY_SCOPE_ALL_SVM_DEVICES
};

extern bool gHost; // temporary flag for testing native host threads (test verification)
extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2)
extern bool gContinueOnError; // execute all cases even when errors detected
extern bool gNoGlobalVariables; // disable cases with global atomics in program scope
extern bool gNoGenericAddressSpace; // disable cases with generic address space
extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
extern bool gDebug; // print OpenCL kernel code
extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity
extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device
extern cl_device_atomic_capabilities gAtomicMemCap,
    gAtomicFenceCap; // atomic memory and fence capabilities for this device

extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType);
extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);

extern cl_int getSupportedMemoryOrdersAndScopes(
    cl_device_id device, std::vector<TExplicitMemoryOrderType> &memoryOrders,
    std::vector<TExplicitMemoryScopeType> &memoryScopes);

class AtomicTypeInfo
{
public:
  TExplicitAtomicType _type;
  AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
  cl_uint Size(cl_device_id device);
  const char* AtomicTypeName();
  const char* RegularTypeName();
  const char* AddSubOperandTypeName();
  int IsSupported(cl_device_id device);
};

template<typename HostDataType>
class AtomicTypeExtendedInfo : public AtomicTypeInfo
{
public:
  AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {}
  HostDataType MinValue();
  HostDataType MaxValue();
  HostDataType SpecialValue(cl_uchar x)
  {
    HostDataType tmp;
    cl_uchar *ptr = (cl_uchar*)&tmp;
    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++)
      ptr[i] = x;
    return tmp;
  }
  HostDataType SpecialValue(cl_ushort x)
  {
    HostDataType tmp;
    cl_ushort *ptr = (cl_ushort*)&tmp;
    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++)
      ptr[i] = x;
    return tmp;
  }
};
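// Illustrative note (not exercised by the suite itself): SpecialValue()
// replicates the given byte or short pattern across every byte/short of the
// host type, so e.g.
//   AtomicTypeExtendedInfo<cl_uint>(TYPE_ATOMIC_UINT).SpecialValue((cl_uchar)0xAA)
// yields 0xAAAAAAAAu.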

class CTest  {
public:
  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0;
};

template<typename HostAtomicType, typename HostDataType>
class CBasicTest : CTest
{
public:
  typedef struct {
    CBasicTest *test;
    cl_uint tid;
    cl_uint threadCount;
    volatile HostAtomicType *destMemory;
    HostDataType *oldValues;
  } THostThreadContext;
  static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo)
  {
    THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id;
    threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues);
    return 0;
  }
  CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(),
    _maxDeviceThreads(MAX_DEVICE_THREADS),
    _dataType(dataType), _useSVM(useSVM), _startValue(255),
    _localMemory(false), _declaredInProgram(false),
    _usedInFunction(false), _genericAddrSpace(false),
    _oldValueCheck(true), _localRefValues(false),
    _maxGroupSize(0), _passCount(0), _iterations(gInternalIterations)
  {
  }
  virtual ~CBasicTest()
  {
    if(_passCount)
      log_info("  %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName());
  }
  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
  {
    return 1;
  }
  virtual cl_uint NumNonAtomicVariablesPerThread()
  {
    return 1;
  }
  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
  {
    return false;
  }
  virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
  {
    return false;
  }
  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
  {
    return false;
  }
  virtual std::string PragmaHeader(cl_device_id deviceID);
  virtual std::string ProgramHeader(cl_uint maxNumDestItems);
  virtual std::string FunctionCode();
  virtual std::string KernelCode(cl_uint maxNumDestItems);
  virtual std::string ProgramCore() = 0;
  virtual std::string SingleTestName()
  {
    std::string testName = LocalMemory() ? "local" : "global";
    testName += " ";
    testName += DataType().AtomicTypeName();
    if(DeclaredInProgram())
    {
      testName += " declared in program";
    }
    if(DeclaredInProgram() && UsedInFunction())
      testName += ",";
    if(UsedInFunction())
    {
      testName += " used in ";
      if(GenericAddrSpace())
        testName += "generic ";
      testName += "function";
    }
    return testName;
  }
  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue);
  int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    int error = 0;
    UsedInFunction(false);
    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
    UsedInFunction(true);
    GenericAddrSpace(false);
    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
    GenericAddrSpace(true);
    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
    GenericAddrSpace(false);
    return error;
  }
  int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    int error = 0;
    DeclaredInProgram(false);
    EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
    if(!UseSVM())
    {
      DeclaredInProgram(true);
      EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
    }
    return error;
  }
  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    int error = 0;
    if(_maxDeviceThreads > 0 && !UseSVM())
    {
      LocalMemory(true);
      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
    }
    if(_maxDeviceThreads+MaxHostThreads() > 0)
    {
      LocalMemory(false);
      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
    }
    return error;
  }
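  // Note on the expansion order (summary of the three helpers above, for
  // orientation only): Execute() -> ExecuteForEachParameterSet() picks
  // local vs. global memory, ExecuteForEachDeclarationType() toggles the
  // program-scope declaration, and ExecuteForEachPointerType() toggles
  // direct use vs. use through a helper function (with and without the
  // generic address space), so a single test object can fan out into up to
  // 2 x 2 x 3 kernel variants per memory order/scope combination.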
  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
  {
    if(sizeof(HostAtomicType) != DataType().Size(deviceID))
    {
      log_info("Invalid test: Host atomic type size (%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID));
      return -1;
    }
    if(sizeof(HostAtomicType) != sizeof(HostDataType))
    {
      log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType));
      return -1;
    }
    // Verify we can run first
    if(UseSVM() && !gUseHostPtr)
    {
      cl_device_svm_capabilities caps;
      cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0);
      test_error(error, "clGetDeviceInfo failed");
      if((caps & CL_DEVICE_SVM_ATOMICS) == 0)
      {
        log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName());
        // implicit pass
        return 0;
      }
    }
    if(!DataType().IsSupported(deviceID))
    {
      log_info("\t%s not supported\n", DataType().AtomicTypeName());
      // implicit pass or host test (debug feature)
      if(UseSVM())
        return 0;
      _maxDeviceThreads = 0;
    }
    if(_maxDeviceThreads+MaxHostThreads() == 0)
      return 0;
    return ExecuteForEachParameterSet(deviceID, context, queue);
  }
  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
  {
    log_info("Empty thread function %u\n", (cl_uint)tid);
  }
  AtomicTypeExtendedInfo<HostDataType> DataType() const
  {
    return AtomicTypeExtendedInfo<HostDataType>(_dataType);
  }
  cl_uint _maxDeviceThreads;
  virtual cl_uint MaxHostThreads()
  {
    if(UseSVM() || gHost)
      return MAX_HOST_THREADS;
    else
      return 0;
  }

  int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
                        TExplicitMemoryOrderType memoryOrder)
  {
      /*
          Differentiation between atomic fence and other atomic operations
          does not need to occur here.

          The initialisation of this test checks that the minimum required
          capabilities are supported by this device.

          The following switches allow the test to skip if optional
          capabilities are not supported by the device.
        */
      switch (memoryScope)
      {
          case MEMORY_SCOPE_EMPTY: {
              break;
          }
          case MEMORY_SCOPE_WORK_GROUP: {
              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
              {
                  return TEST_SKIPPED_ITSELF;
              }
              break;
          }
          case MEMORY_SCOPE_DEVICE: {
              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
              {
                  return TEST_SKIPPED_ITSELF;
              }
              break;
          }
          case MEMORY_SCOPE_ALL_DEVICES: // fallthrough
          case MEMORY_SCOPE_ALL_SVM_DEVICES: {
              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
              {
                  return TEST_SKIPPED_ITSELF;
              }
              break;
          }
          default: {
              log_info("Invalid memory scope\n");
              break;
          }
      }

      switch (memoryOrder)
      {
          case MEMORY_ORDER_EMPTY: {
              break;
          }
          case MEMORY_ORDER_RELAXED: {
              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
              {
                  return TEST_SKIPPED_ITSELF;
              }
              break;
          }
          case MEMORY_ORDER_ACQUIRE:
          case MEMORY_ORDER_RELEASE:
          case MEMORY_ORDER_ACQ_REL: {
              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
              {
                  return TEST_SKIPPED_ITSELF;
              }
              break;
          }
          case MEMORY_ORDER_SEQ_CST: {
              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
              {
                  return TEST_SKIPPED_ITSELF;
              }
              break;
          }
          default: {
              log_info("Invalid memory order\n");
              break;
          }
      }

      return 0;
  }
  virtual bool SVMDataBufferAllSVMConsistent() {return false;}
  bool UseSVM() {return _useSVM;}
  void StartValue(HostDataType startValue) {_startValue = startValue;}
  HostDataType StartValue() {return _startValue;}
  void LocalMemory(bool local) {_localMemory = local;}
  bool LocalMemory() {return _localMemory;}
  void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;}
  bool DeclaredInProgram() {return _declaredInProgram;}
  void UsedInFunction(bool local) {_usedInFunction = local;}
  bool UsedInFunction() {return _usedInFunction;}
  void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;}
  bool GenericAddrSpace() {return _genericAddrSpace;}
  void OldValueCheck(bool check) {_oldValueCheck = check;}
  bool OldValueCheck() {return _oldValueCheck;}
  void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;}
  bool LocalRefValues() {return _localRefValues;}
  void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;}
  cl_uint MaxGroupSize() {return _maxGroupSize;}
  void CurrentGroupSize(cl_uint currentGroupSize)
  {
    if(MaxGroupSize() && MaxGroupSize() < currentGroupSize)
      _currentGroupSize = MaxGroupSize();
    else
      _currentGroupSize = currentGroupSize;
  }
  cl_uint CurrentGroupSize() {return _currentGroupSize;}
  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
  {
    if(threadCount == 0)
      return 0;
    if(LocalMemory())
      return 1;
    return threadCount/CurrentGroupSize();
  }
  cl_int Iterations() {return _iterations;}
  std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();}
private:
  const TExplicitAtomicType _dataType;
  const bool _useSVM;
  HostDataType _startValue;
  bool _localMemory;
  bool _declaredInProgram;
  bool _usedInFunction;
  bool _genericAddrSpace;
  bool _oldValueCheck;
  bool _localRefValues;
  cl_uint _maxGroupSize;
  cl_uint _currentGroupSize;
  cl_uint _passCount;
  const cl_int _iterations;
};
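// Illustrative sketch only (hypothetical subclass, not part of this header):
// a concrete test typically derives from CBasicTest, emits the body of the
// OpenCL kernel in ProgramCore(), and verifies the result via ExpectedValue()
// and/or VerifyRefs(), e.g.
//
//   template<typename HostAtomicType, typename HostDataType>
//   class CExampleFetchAdd : public CBasicTest<HostAtomicType, HostDataType>
//   {
//   public:
//     CExampleFetchAdd(TExplicitAtomicType t, bool svm)
//       : CBasicTest<HostAtomicType, HostDataType>(t, svm) {}
//     virtual std::string ProgramCore()
//     {
//       return "  oldValues[tid] = atomic_fetch_add(&destMemory[0], ("
//         + std::string(this->DataType().AddSubOperandTypeName()) + ")1);\n";
//     }
//     virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
//                                HostDataType *startRefValues, cl_uint whichDestValue)
//     {
//       expected = this->StartValue() + (HostDataType)threadCount;
//       return true;
//     }
//   };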

template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrderScope : public CBasicTest<HostAtomicType, HostDataType>
{
public:
  using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
  using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
  CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
  {
  }
  virtual std::string ProgramHeader(cl_uint maxNumDestItems)
  {
    std::string header;
    if(gOldAPI)
    {
      std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
      header +=
        "#define atomic_store_explicit(x,y,o"+s+")                     atomic_store(x,y)\n"
        "#define atomic_load_explicit(x,o"+s+")                        atomic_load(x)\n"
        "#define atomic_exchange_explicit(x,y,o"+s+")                  atomic_exchange(x,y)\n"
        "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n"
        "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+")   atomic_compare_exchange_weak(x,y,z)\n"
        "#define atomic_fetch_add_explicit(x,y,o"+s+")                 atomic_fetch_add(x,y)\n"
        "#define atomic_fetch_sub_explicit(x,y,o"+s+")                 atomic_fetch_sub(x,y)\n"
        "#define atomic_fetch_or_explicit(x,y,o"+s+")                  atomic_fetch_or(x,y)\n"
        "#define atomic_fetch_xor_explicit(x,y,o"+s+")                 atomic_fetch_xor(x,y)\n"
        "#define atomic_fetch_and_explicit(x,y,o"+s+")                 atomic_fetch_and(x,y)\n"
        "#define atomic_fetch_min_explicit(x,y,o"+s+")                 atomic_fetch_min(x,y)\n"
        "#define atomic_fetch_max_explicit(x,y,o"+s+")                 atomic_fetch_max(x,y)\n"
        "#define atomic_flag_test_and_set_explicit(x,o"+s+")           atomic_flag_test_and_set(x)\n"
        "#define atomic_flag_clear_explicit(x,o"+s+")                  atomic_flag_clear(x)\n";
    }
    return header+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems);
  }
  virtual std::string SingleTestName()
  {
    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
    {
      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
    }
    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
    {
      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
    }
    return testName;
  }
  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    if(LocalMemory() &&
      MemoryScope() != MEMORY_SCOPE_EMPTY &&
      MemoryScope() != MEMORY_SCOPE_WORK_GROUP) //memory scope should only be used for global memory
      return 0;
    if(MemoryScope() == MEMORY_SCOPE_DEVICE)
      MaxGroupSize(16); // increase number of groups by forcing smaller group size
    else
      MaxGroupSize(0); // group size limited by device capabilities

    if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF)
        return 0; // skip test - not applicable

    return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
  }
  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    // repeat test for each reasonable memory order/scope combination
    std::vector<TExplicitMemoryOrderType> memoryOrder;
    std::vector<TExplicitMemoryScopeType> memoryScope;
    int error = 0;

    // For OpenCL-3.0 and later some orderings and scopes are optional, so here
    // we query for the supported ones.
    test_error_ret(
        getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
        "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
    {
      for(unsigned si = 0; si < memoryScope.size(); si++)
      {
        if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY)
          continue;
        MemoryOrder(memoryOrder[oi]);
        MemoryScope(memoryScope[si]);
        EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
      }
    }
    return error;
  }
  void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;}
  TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;}
  std::string MemoryOrderStr()
  {
    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
      return std::string(", ")+get_memory_order_type_name(MemoryOrder());
    return "";
  }
  void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;}
  TExplicitMemoryScopeType MemoryScope() {return _memoryScope;}
  std::string MemoryScopeStr()
  {
    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
      return std::string(", ")+get_memory_scope_type_name(MemoryScope());
    return "";
  }
  std::string MemoryOrderScopeStr()
  {
    return MemoryOrderStr()+MemoryScopeStr();
  }
  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
  {
    if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP)
      return 1;
    return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(threadCount);
  }
  virtual cl_uint MaxHostThreads()
  {
      // block host threads execution for memory scope different than
      // memory_scope_all_svm_devices
      if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
          || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
      {
          return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
      }
      else
      {
          return 0;
      }
  }
private:
  TExplicitMemoryOrderType _memoryOrder;
  TExplicitMemoryScopeType _memoryScope;
};

template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
{
public:
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;

  CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
  {
  }
  virtual std::string SingleTestName()
  {
    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory"));
    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
    return testName;
  }
  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    // repeat test for each reasonable memory order/scope combination
    std::vector<TExplicitMemoryOrderType> memoryOrder;
    std::vector<TExplicitMemoryScopeType> memoryScope;
    int error = 0;

    // For OpenCL-3.0 and later some orderings and scopes are optional, so here
    // we query for the supported ones.
    test_error_ret(
        getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
        "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
    {
      for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
      {
        for(unsigned si = 0; si < memoryScope.size(); si++)
        {
          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
            && memoryOrder[oi] != memoryOrder[o2i])
            continue; // both memory order arguments must be set (or none)
          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
            && memoryScope[si] != MEMORY_SCOPE_EMPTY)
            continue; // memory scope without memory order is not allowed
          MemoryOrder(memoryOrder[oi]);
          MemoryOrder2(memoryOrder[o2i]);
          MemoryScope(memoryScope[si]);

          if (CheckCapabilities(MemoryScope(), MemoryOrder())
              == TEST_SKIPPED_ITSELF)
              continue; // skip test - not applicable

          if (CheckCapabilities(MemoryScope(), MemoryOrder2())
              == TEST_SKIPPED_ITSELF)
              continue; // skip test - not applicable

          EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
        }
      }
    }
    return error;
  }
  void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;}
  TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;}
  std::string MemoryOrderFailStr()
  {
    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
      return std::string(", ")+get_memory_order_type_name(MemoryOrder2());
    return "";
  }
  std::string MemoryOrderScope()
  {
    return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr();
  }
private:
  TExplicitMemoryOrderType _memoryOrder2;
};
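// Note (orientation only): the second memory order kept in _memoryOrder2 is
// intended for the two-order operations such as
// atomic_compare_exchange_*_explicit, where it plays the role of the failure
// order (hence the memoryOrderFail parameter name above); CheckCapabilities()
// is run for both orders before a combination is executed.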

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
{
  std::string pragma;

  if(gOldAPI)
  {
    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n";
    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n";
    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n";
    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n";
  }
  // Create the pragma lines for this kernel
  if(DataType().Size(deviceID) == 8)
  {
    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
  }
  if(_dataType == TYPE_ATOMIC_DOUBLE)
    pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
  return pragma;
}
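// For example (illustrative only), for TYPE_ATOMIC_DOUBLE on a device where
// the type is 8 bytes, the returned header enables cl_khr_int64_base_atomics,
// cl_khr_int64_extended_atomics and cl_khr_fp64, plus the 32-bit atomics
// extensions when gOldAPI is set.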

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
{
  // Create the program header
  std::string header;
  std::string aTypeName = DataType().AtomicTypeName();
  std::string cTypeName = DataType().RegularTypeName();
  std::string argListForKernel;
  std::string argListForFunction;
  std::string argListNoTypes;
  std::string functionPrototype;
  std::string addressSpace = LocalMemory() ? "__local " : "__global ";

  if(gOldAPI)
  {
    header += std::string("#define ")+aTypeName+" "+cTypeName+"\n"
      "#define atomic_store(x,y)                                (*(x) = y)\n"
      "#define atomic_load(x)                                   (*(x))\n"
      "#define ATOMIC_VAR_INIT(x)                               (x)\n"
      "#define ATOMIC_FLAG_INIT                                 0\n"
      "#define atomic_init(x,y)                                 atomic_store(x,y)\n";
    if(aTypeName == "atomic_float")
      header += "#define atomic_exchange(x,y)                             atomic_xchg(x,y)\n";
    else if(aTypeName == "atomic_double")
      header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n"
        "{\n"
        "  long tmp = *(long*)&y, res;\n"
        "  volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long*)x;\n"
        "  res = atom_xchg(tmpA,tmp);\n"
        "  return *(double*)&res;\n"
        "}\n";
    else
      header += "#define atomic_exchange(x,y)                             atom_xchg(x,y)\n";
    if(aTypeName != "atomic_float" && aTypeName != "atomic_double")
      header +=
      "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n"
      "{\n"
      "  "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n"
      "  if(old == *expected)\n"
      "    return true;\n"
      "  *expected = old;\n"
      "  return false;\n"
      "}\n"
      "#define atomic_compare_exchange_weak                     atomic_compare_exchange_strong\n";
    header +=
      "#define atomic_fetch_add(x,y)                            atom_add(x,y)\n"
      "#define atomic_fetch_sub(x,y)                            atom_sub(x,y)\n"
      "#define atomic_fetch_or(x,y)                             atom_or(x,y)\n"
      "#define atomic_fetch_xor(x,y)                            atom_xor(x,y)\n"
      "#define atomic_fetch_and(x,y)                            atom_and(x,y)\n"
      "#define atomic_fetch_min(x,y)                            atom_min(x,y)\n"
      "#define atomic_fetch_max(x,y)                            atom_max(x,y)\n"
      "#define atomic_flag_test_and_set(x)                      atomic_exchange(x,1)\n"
      "#define atomic_flag_clear(x)                             atomic_store(x,0)\n"
      "\n";
  }
  if(!LocalMemory() && DeclaredInProgram())
  {
    // additional atomic variable for results copying (last thread will do this)
    header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n";
    // atomic variables declared in program scope - test data
    std::stringstream ss;
    ss << maxNumDestItems;
    header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n";
    ss.str("");
    ss << _startValue;
    for(cl_uint i = 0; i < maxNumDestItems; i++)
    {
      if(aTypeName == "atomic_flag")
        header +=  "  ATOMIC_FLAG_INIT";
      else
        header +=  "  ATOMIC_VAR_INIT("+ss.str()+")";
      if(i+1 < maxNumDestItems)
        header += ",";
      header += "\n";
    }
    header+=
      "};\n"
      "\n";
  }
  return header;
}
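// For example (illustrative only), with the default start value of 255 and
// two result items, the program-scope branch above would emit roughly:
//
//   __global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);
//   __global volatile atomic_int destMemory[2] = {
//     ATOMIC_VAR_INIT(255),
//     ATOMIC_VAR_INIT(255)
//   };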

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
{
  if(!UsedInFunction())
    return "";
  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
  std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile ";
  if(!GenericAddrSpace())
    code += addressSpace;
  code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+
    " *oldValues";
  if(LocalRefValues())
    code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues";
  code += ")\n"
    "{\n";
  code += ProgramCore();
  code += "}\n"
    "\n";
  return code;
}

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
{
  std::string aTypeName = DataType().AtomicTypeName();
  std::string cTypeName = DataType().RegularTypeName();
  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
  std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, ";

  // prepare list of arguments for kernel
  if(LocalMemory())
  {
    code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues,"
      " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory";
  }
  else
  {
    code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+
      ", __global "+cTypeName+" *oldValues";
  }
  if(LocalRefValues())
    code += std::string(", __local ")+cTypeName+" *localValues";
  code += ")\n"
    "{\n";
  if(LocalMemory() && DeclaredInProgram())
  {
    // local atomics declared in kernel scope
    std::stringstream ss;
    ss << maxNumDestItems;
    code += std::string("  __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n";
  }
  code += "  uint  tid = get_global_id(0);\n"
    "\n";
  if(LocalMemory())
  {
      // memory_order_relaxed is sufficient for these initialization operations
      // as the barrier below will act as a fence, providing an order to the
      // operations. memory_scope_work_group is sufficient as local memory is
      // only visible within the work-group.
      code += R"(
              // initialize atomics not reachable from host (first thread
              // is doing this, other threads are waiting on barrier)
              if(get_local_id(0) == 0)
                for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)
                {)";
      if (aTypeName == "atomic_flag")
      {
          code += R"(
                  if(finalDest[dstItemIdx])
                    atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                                      memory_order_relaxed,
                                                      memory_scope_work_group);
                  else
                    atomic_flag_clear_explicit(destMemory+dstItemIdx,
                                               memory_order_relaxed,
                                               memory_scope_work_group);)";
      }
    else
    {
        code += R"(
                atomic_store_explicit(destMemory+dstItemIdx,
                                      finalDest[dstItemIdx],
                                      memory_order_relaxed,
                                      memory_scope_work_group);)";
    }
    code +=
      "    }\n"
      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
      "\n";
  }
  if (LocalRefValues())
  {
    code +=
      "  // Copy input reference values into local memory\n";
    if (NumNonAtomicVariablesPerThread() == 1)
      code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
    else
    {
      std::stringstream ss;
      ss << NumNonAtomicVariablesPerThread();
      code +=
        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
        "    localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
    }
    code +=
      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
      "\n";
  }
  if (UsedInFunction())
    code += std::string("  test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+
    (LocalRefValues() ? ", localValues" : "")+");\n";
  else
    code += ProgramCore();
  code += "\n";
  if (LocalRefValues())
  {
    code +=
      "  // Copy local reference values into output array\n"
      "  barrier(CLK_LOCAL_MEM_FENCE);\n";
    if (NumNonAtomicVariablesPerThread() == 1)
      code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
    else
    {
      std::stringstream ss;
      ss << NumNonAtomicVariablesPerThread();
      code +=
        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
        "    oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n";
    }
    code += "\n";
  }
  if(LocalMemory() || DeclaredInProgram())
  {
    code += "  // Copy final values to host reachable buffer\n";
    if(LocalMemory())
      code +=
        "  barrier(CLK_LOCAL_MEM_FENCE);\n"
        "  if(get_local_id(0) == 0) // first thread in workgroup\n";
    else
      // global atomics declared in program scope
      code += R"(
                if(atomic_fetch_add_explicit(&finishedThreads, 1u,
                                           memory_order_relaxed,
                                           memory_scope_work_group)
                   == get_global_size(0)-1) // last finished thread
                   )";
    code +=
        "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n";
    if(aTypeName == "atomic_flag")
    {
        code += R"(
                finalDest[dstItemIdx] =
                    atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                                      memory_order_relaxed,
                                                      memory_scope_work_group);)";
    }
    else
    {
        code += R"(
                finalDest[dstItemIdx] =
                    atomic_load_explicit(destMemory+dstItemIdx,
                                         memory_order_relaxed,
                                         memory_scope_work_group);)";
    }
  }
  code += "}\n"
    "\n";
  return code;
}
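// For orientation only: for a global-memory test that is neither declared in
// program scope nor used via a helper function, the generated kernel begins
// roughly as
//
//   __kernel void test_atomic_kernel(uint threadCount, uint numDestItems,
//       volatile __global atomic_int *destMemory, __global int *oldValues)
//   {
//     uint  tid = get_global_id(0);
//     ...
//   }
//
// with the test-specific body supplied by ProgramCore().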

template <typename HostAtomicType, typename HostDataType>
int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
  int error;
  clProgramWrapper program;
  clKernelWrapper kernel;
  size_t threadNum[1];
  clMemWrapper streams[2];
  std::vector<HostAtomicType> destItems;
  HostAtomicType *svmAtomicBuffer = 0;
  std::vector<HostDataType> refValues, startRefValues;
  HostDataType *svmDataBuffer = 0;
  cl_uint deviceThreadCount, hostThreadCount, threadCount;
  size_t groupSize = 0;
  std::string programSource;
  const char *programLine;
  MTdata d;
  size_t typeSize = DataType().Size(deviceID);

  deviceThreadCount = _maxDeviceThreads;
  hostThreadCount = MaxHostThreads();
  threadCount = deviceThreadCount+hostThreadCount;

  //log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str());
  log_info("\t%s...\n", SingleTestName().c_str());

  if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables
  {
    log_info("\t\tTest disabled\n");
    return 0;
  }
  if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
  {
    log_info("\t\tTest disabled\n");
    return 0;
  }

  // set up work sizes based on device capabilities and test configuration
  error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
  test_error(error, "Unable to obtain max work group size for device");
  CurrentGroupSize((cl_uint)groupSize);
  if(CurrentGroupSize() > deviceThreadCount)
    CurrentGroupSize(deviceThreadCount);
  if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
    deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
  threadCount = deviceThreadCount+hostThreadCount;

  // If we're given a num_results function, we need to determine how many result objects we need.
  // This is a first estimate based on the current maximum number of threads (the exact thread count is not known yet)
  // - needed for program source code generation (arrays of atomics declared in program)
  cl_uint numDestItems = NumResults(threadCount, deviceID);

  if(deviceThreadCount > 0)
  {
      // This loop iteratively halves the workgroup size and re-generates the
      // kernel with the reduced workgroup size, until we find a size which is
      // admissible for the kernel being run or until we reach the trivial
      // workgroup size of 1 (which was separately verified to be accurate for
      // the kernel being run).

      while ((CurrentGroupSize() > 1))
      {
          // Re-generate the kernel code with the current group size
          if (kernel) clReleaseKernel(kernel);
          if (program) clReleaseProgram(program);
          programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
              + FunctionCode() + KernelCode(numDestItems);
          programLine = programSource.c_str();
          if (create_single_kernel_helper_with_build_options(
                  context, &program, &kernel, 1, &programLine,
                  "test_atomic_kernel", gOldAPI ? "" : nullptr))
          {
              return -1;
          }
          // Get work group size for the new kernel
          error = clGetKernelWorkGroupInfo(kernel, deviceID,
                                           CL_KERNEL_WORK_GROUP_SIZE,
                                           sizeof(groupSize), &groupSize, NULL);
          test_error(error,
                     "Unable to obtain max work group size for device and "
                     "kernel combo");

          if (LocalMemory())
          {
              cl_ulong usedLocalMemory;
              cl_ulong totalLocalMemory;
              cl_uint maxWorkGroupSize;

              error = clGetKernelWorkGroupInfo(
                  kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
                  sizeof(usedLocalMemory), &usedLocalMemory, NULL);
              test_error(error, "clGetKernelWorkGroupInfo failed");

              error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
                                      sizeof(totalLocalMemory),
                                      &totalLocalMemory, NULL);
              test_error(error, "clGetDeviceInfo failed");

              // We know that each work-group is going to use typeSize *
              // deviceThreadCount bytes of local memory
              // so pick the maximum value for deviceThreadCount that uses all
              // the local memory.
              maxWorkGroupSize =
                  ((totalLocalMemory - usedLocalMemory) / typeSize);

              if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
          }
          if (CurrentGroupSize() <= groupSize)
              break;
          else
              CurrentGroupSize(CurrentGroupSize() / 2);
      }
    if(CurrentGroupSize() > deviceThreadCount)
      CurrentGroupSize(deviceThreadCount);
    if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
      deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
    threadCount = deviceThreadCount+hostThreadCount;
  }
  if (gDebug)
  {
      log_info("Program source:\n");
      log_info("%s\n", programLine);
  }
  if(deviceThreadCount > 0)
    log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
  if(hostThreadCount > 0)
    log_info("\t\t(host threads %u)\n", hostThreadCount);

  refValues.resize(threadCount*NumNonAtomicVariablesPerThread());

  // Generate ref data if we have a ref generator provided
  d = init_genrand(gRandomSeed);
  startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread());
  if(GenerateRefs(threadCount, &startRefValues[0], d))
  {
    //copy ref values for host threads
    memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread());
  }
  else
  {
    startRefValues.resize(0);
  }
  free_mtdata(d);
  d = NULL;

  // If we're given a num_results function, we need to determine how many result objects we need. If
  // we don't have it, we assume it's just 1
  // This is the final value (the exact thread count is known at this point)
  numDestItems = NumResults(threadCount, deviceID);

  destItems.resize(numDestItems);
  for(cl_uint i = 0; i < numDestItems; i++)
    destItems[i] = _startValue;

  // Create main buffer with atomic variables (array size dependent on particular test)
  if(UseSVM())
  {
    if(gUseHostPtr)
      svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems);
    else
      svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0);
    if(!svmAtomicBuffer)
    {
      log_error("ERROR: clSVMAlloc failed!\n");
      return -1;
    }
    memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
    streams[0] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
                                typeSize * numDestItems, svmAtomicBuffer, NULL);
  }
  else
  {
      streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                  typeSize * numDestItems, &destItems[0], NULL);
  }
  if (!streams[0])
  {
    log_error("ERROR: Creating output array failed!\n");
    return -1;
  }
  // Create buffer for per-thread input/output data
  if(UseSVM())
  {
    if(gUseHostPtr)
      svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread());
    else
      svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0);
    if(!svmDataBuffer)
    {
      log_error("ERROR: clSVMAlloc failed!\n");
      return -1;
    }
    if(startRefValues.size())
      memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread());
    streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
                                typeSize * threadCount
                                    * NumNonAtomicVariablesPerThread(),
                                svmDataBuffer, NULL);
  }
  else
  {
      streams[1] = clCreateBuffer(
          context,
          ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
          typeSize * threadCount * NumNonAtomicVariablesPerThread(),
          startRefValues.size() ? &startRefValues[0] : 0, NULL);
  }
  if (!streams[1])
  {
    log_error("ERROR: Creating reference array failed!\n");
    return -1;
  }
  if(deviceThreadCount > 0)
  {
    cl_uint argInd = 0;
    /* Set the arguments */
    error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
    test_error(error, "Unable to set kernel argument");
    error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems);
    test_error(error, "Unable to set indexed kernel argument");
    error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
    test_error(error, "Unable to set indexed kernel arguments");
    error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
    test_error(error, "Unable to set indexed kernel arguments");
    if(LocalMemory())
    {
      error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
      test_error(error, "Unable to set indexed local kernel argument");
    }
    if(LocalRefValues())
    {
      error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL);
      test_error(error, "Unable to set indexed kernel argument");
    }
  }
  /* Configure host threads */
  std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
  for(unsigned int t = 0; t < hostThreadCount; t++)
  {
    hostThreadContexts[t].test = this;
    hostThreadContexts[t].tid = deviceThreadCount+t;
    hostThreadContexts[t].threadCount = threadCount;
    hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0];
    hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0];
  }

  if(deviceThreadCount > 0)
  {
    /* Run the kernel */
    threadNum[0] = deviceThreadCount;
    groupSize = CurrentGroupSize();
    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL);
    test_error(error, "Unable to execute test kernel");
    /* start device threads */
    error = clFlush(queue);
    test_error(error, "clFlush failed");
  }

  /* Start host threads and wait for finish */
  if(hostThreadCount > 0)
    ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]);

  if(UseSVM())
  {
    error = clFinish(queue);
    test_error(error, "clFinish failed");
    memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems);
    memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread());
  }
  else
  {
    if(deviceThreadCount > 0)
    {
      error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
      test_error(error, "Unable to read result value!");
      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL);
      test_error(error, "Unable to read reference values!");
    }
  }
  bool dataVerified = false;
  // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
  // have one, it's because we're comparing ref values only
  for(cl_uint i = 0; i < numDestItems; i++)
  {
    HostDataType expected;

    if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i))
      break; // no expected value function provided

    if(expected != destItems[i])
    {
      std::stringstream logLine;
      logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n";
      log_error("%s", logLine.str().c_str());
      for(i = 0; i < threadCount; i++)
      {
        logLine.str("");
        logLine << " --- " << i << " - ";
        if(startRefValues.size())
          logLine << startRefValues[i] << " -> " << refValues[i];
        else
          logLine << refValues[i];
        logLine << " --- ";
        if(i < numDestItems)
          logLine << destItems[i];
        logLine << "\n";
        log_info("%s", logLine.str().c_str());
      }
      if(!gDebug)
      {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
      }
      return -1;
    }
    dataVerified = true;
  }

  bool dataCorrect = false;
  /* Use the verify function (if provided) to also check the results */
  if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
  {
    if(!dataCorrect)
    {
      log_error("ERROR: Reference values did not validate!\n");
      std::stringstream logLine;
      for(cl_uint i = 0; i < threadCount; i++)
      for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
      {
        logLine.str("");
        logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- ";
        if(j == 0 && i < numDestItems)
          logLine << destItems[i];
        logLine << "\n";
        log_info("%s", logLine.str().c_str());
      }
      if(!gDebug)
      {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
      }
      return -1;
    }
  }
  else if(!dataVerified)
  {
    log_error("ERROR: Test doesn't check total or refs; no values are verified!\n");
    return -1;
  }

  if(OldValueCheck() &&
    !(DeclaredInProgram() && !LocalMemory())) // don't test for program-scope global atomics
                                             // 'old' value has been overwritten by previous clEnqueueNDRangeKernel
  {
    /* Re-write the starting value */
    for(size_t i = 0; i < numDestItems; i++)
      destItems[i] = _startValue;
    refValues[0] = 0;
    if(deviceThreadCount > 0)
    {
      error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
      test_error(error, "Unable to write starting values!");

      /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
      threadNum[0] = 1;
      error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL);
      test_error(error, "Unable to execute test kernel");

      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL);
      test_error(error, "Unable to read reference values!");
    }
    else
    {
      /* Start host thread */
      HostFunction(0, 1, &destItems[0], &refValues[0]);
    }

    if(refValues[0] != _startValue)//destItems[0])
    {
      std::stringstream logLine;
      logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value"
        " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n";
      log_error("%s", logLine.str().c_str());
      if(!gDebug)
      {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
      }
      return -1;
    }
  }
  if(UseSVM())
  {
    // the buffer object must first be released before the SVM buffer is freed
    error = clReleaseMemObject(streams[0]);
    streams[0] = 0;
    test_error(error, "clReleaseMemObject failed");
    if(gUseHostPtr)
      free(svmAtomicBuffer);
    else
      clSVMFree(context, svmAtomicBuffer);
    error = clReleaseMemObject(streams[1]);
    streams[1] = 0;
    test_error(error, "clReleaseMemObject failed");
    if(gUseHostPtr)
      free(svmDataBuffer);
    else
      clSVMFree(context, svmDataBuffer);
  }
  _passCount++;
  return 0;
}

#endif //_COMMON_H_