1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "testBase.h"
17 #include "harness/conversions.h"
18 #ifndef _WIN32
19 #include <unistd.h>
20 #endif
21 
22 #define INT_TEST_VALUE 402258822
23 #define LONG_TEST_VALUE 515154531254381446LL
24 
25 
// Kernel skeleton for global-memory atomic tests: element [0] is the prologue
// (formatted with the data type name substituted for both %s occurrences),
// the test-specific core source is spliced in between, and element [1]
// closes the kernel body.
const char *atomic_global_pattern[] = {
    "__kernel void test_atomic_fn(volatile __global %s *destMemory, __global %s *oldValues)\n"
    "{\n"
    "    int  tid = get_global_id(0);\n"
    "\n"
    ,
    "\n"
    "}\n" };
34 
// Kernel skeleton for local-memory atomic tests (three %s type substitutions).
// The prologue copies the initial destination values from global into local
// memory and barriers; the epilogue barriers again and copies the final local
// values back out so the host can validate them.
const char *atomic_local_pattern[] = {
    "__kernel void test_atomic_fn(__global %s *finalDest, __global %s *oldValues, volatile __local %s *destMemory, int numDestItems )\n"
    "{\n"
    "    int  tid = get_global_id(0);\n"
    "     int  dstItemIdx;\n"
    "\n"
    "    // Everybody does the following line(s), but it all has the same result. We still need to ensure we sync before the atomic op, though\n"
    "     for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    "        destMemory[ dstItemIdx ] = finalDest[ dstItemIdx ];\n"
    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    ,
    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
    "    // Finally, write out the last value. Again, we're synced, so everyone will be writing the same value\n"
    "     for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    "        finalDest[ dstItemIdx ] = destMemory[ dstItemIdx ];\n"
    "}\n" };
52 
53 
54 #define TEST_COUNT 128 * 1024
55 
56 
// Bundle of per-test start values and validation callbacks consumed by
// test_atomic_function(). Any callback may be NULL; which ones are non-NULL
// determines how results are validated (computed expected final value via
// ExpectedValue*Fn, and/or inspection of the returned "old" values via
// VerifyRefs*Fn).
struct TestFns
{
    // Initial destination value written before the kernel runs (32-bit tests)
    cl_int    mIntStartValue;
    // Initial destination value for 64-bit tests
    cl_long    mLongStartValue;

    // Optional: number of destination items the kernel operates on (1 if NULL)
    size_t    (*NumResultsFn)( size_t threadSize, ExplicitType dataType );

    // Integer versions
    // Expected final destination value for item whichDestValue
    cl_int    (*ExpectedValueIntFn)( size_t size, cl_int *startRefValues, size_t whichDestValue );
    // Optional: fills startRefValues with per-thread input values
    void    (*GenerateRefsIntFn)( size_t size, cl_int *startRefValues, MTdata d );
    // Optional: validates the per-thread "old" values read back from the kernel
    bool    (*VerifyRefsIntFn)( size_t size, cl_int *refValues, cl_int finalValue );

    // Long versions
    cl_long    (*ExpectedValueLongFn)( size_t size, cl_long *startRefValues, size_t whichDestValue );
    void    (*GenerateRefsLongFn)( size_t size, cl_long *startRefValues, MTdata d );
    bool    (*VerifyRefsLongFn)( size_t size, cl_long *refValues, cl_long finalValue );

    // Float versions
    cl_float    (*ExpectedValueFloatFn)( size_t size, cl_float *startRefValues, size_t whichDestValue );
    void        (*GenerateRefsFloatFn)( size_t size, cl_float *startRefValues, MTdata d );
    bool        (*VerifyRefsFloatFn)( size_t size, cl_float *refValues, cl_float finalValue );
};
79 
check_atomic_support(cl_device_id device,bool extended,bool isLocal,ExplicitType dataType)80 bool check_atomic_support( cl_device_id device, bool extended, bool isLocal, ExplicitType dataType )
81 {
82     const char *extensionNames[8] = {
83         "cl_khr_global_int32_base_atomics", "cl_khr_global_int32_extended_atomics",
84         "cl_khr_local_int32_base_atomics",  "cl_khr_local_int32_extended_atomics",
85         "cl_khr_int64_base_atomics",        "cl_khr_int64_extended_atomics",
86         "cl_khr_int64_base_atomics",        "cl_khr_int64_extended_atomics"       // this line intended to be the same as the last one
87     };
88 
89     size_t index = 0;
90     if( extended )
91         index += 1;
92     if( isLocal )
93         index += 2;
94 
95     Version version = get_device_cl_version(device);
96 
97     switch (dataType)
98     {
99         case kInt:
100         case kUInt:
101             if( version >= Version(1,1) )
102                 return 1;
103             break;
104         case kLong:
105         case kULong:
106             index += 4;
107             break;
108         case kFloat:  // this has to stay separate since the float atomics arent in the 1.0 extensions
109             return version >= Version(1,1);
110         default:
111             log_error( "ERROR:  Unsupported data type (%d) in check_atomic_support\n", dataType );
112             return 0;
113     }
114 
115     return is_extension_available( device, extensionNames[index] );
116 }
117 
// Core harness for a single atomic-function test:
//  1. Builds a kernel from [extension pragma] + [global/local prologue] +
//     programCore + [epilogue], with the type name substituted in.
//  2. Runs it over threadSize work-items, reading back both the final
//     destination value(s) and each thread's returned "old" value.
//  3. Validates via testFns.ExpectedValue*Fn (computed final value) and/or
//     testFns.VerifyRefs*Fn (inspection of the old-value set).
//  4. Re-runs with a single work-item to confirm the atomic returned the
//     pre-operation value.
// Returns 0 on pass (an unsupported configuration implicitly passes), -1 on
// any failure.
int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore,
                         TestFns testFns,
                         bool extended, bool isLocal, ExplicitType dataType, bool matchGroupSize )
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    int error;
    size_t threads[1];
    clMemWrapper streams[2];
    void *refValues, *startRefValues;
    size_t threadSize, groupSize;
    const char *programLines[4];
    char pragma[ 512 ];
    char programHeader[ 512 ];
    MTdata d;
    size_t typeSize = get_explicit_type_size( dataType );


    // Verify we can run first
    bool isUnsigned = ( dataType == kULong ) || ( dataType == kUInt );
    if( !check_atomic_support( deviceID, extended, isLocal, dataType ) )
    {
        // Only print for the signed (unsigned comes right after, and if signed isn't supported, unsigned isn't either)
        if( dataType == kFloat )
            log_info( "\t%s float not supported\n", isLocal ? "Local" : "Global" );
        else if( !isUnsigned )
            log_info( "\t%s %sint%d not supported\n", isLocal ? "Local" : "Global", isUnsigned ? "u" : "", (int)typeSize * 8 );
        // Since we don't support the operation, they implicitly pass
        return 0;
    }
    else
    {
        if( dataType == kFloat )
            log_info( "\t%s float%s...", isLocal ? "local" : "global", isLocal ? " " : "" );
        else
            log_info( "\t%s %sint%d%s%s...", isLocal ? "local" : "global", isUnsigned ? "u" : "",
                     (int)typeSize * 8, isUnsigned ? "" : " ", isLocal ? " " : "" );
    }

    //// Set up the kernel code

    // Create the pragma line for this kernel
    bool isLong = ( dataType == kLong || dataType == kULong );
    sprintf( pragma, "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n",
            isLong ? "" : (isLocal ? "_local" : "_global"), isLong ? "64" : "32",
            extended ? "extended" : "base" );

    // Now create the program header
    const char *typeName = get_explicit_type_name( dataType );
    if( isLocal )
        sprintf( programHeader, atomic_local_pattern[ 0 ], typeName, typeName, typeName );
    else
        sprintf( programHeader, atomic_global_pattern[ 0 ], typeName, typeName );

    // Set up our entire program now
    programLines[ 0 ] = pragma;
    programLines[ 1 ] = programHeader;
    programLines[ 2 ] = programCore;
    programLines[ 3 ] = ( isLocal ) ? atomic_local_pattern[ 1 ] : atomic_global_pattern[ 1 ];

    if( create_single_kernel_helper( context, &program, &kernel, 4, programLines, "test_atomic_fn" ) )
    {
        return -1;
    }

    //// Set up to actually run
    threadSize = num_elements;

    error = get_max_common_work_group_size( context, kernel, threadSize, &groupSize );
    test_error( error, "Unable to get thread group max size" );

    if( matchGroupSize )
        // HACK because xchg and cmpxchg apparently are limited by hardware
        threadSize = groupSize;

    if( isLocal )
    {
        size_t maxSizes[3] = {0, 0, 0};
        error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*sizeof(size_t), maxSizes, 0);
        test_error( error, "Unable to obtain max work item sizes for the device" );

        size_t workSize;
        error = clGetKernelWorkGroupInfo( kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof( workSize ), &workSize, NULL );
        test_error( error, "Unable to obtain max work group size for device and kernel combo" );

        // "workSize" is limited to that of the first dimension as only a 1DRange is executed.
        if( maxSizes[0] < workSize )
        {
            workSize = maxSizes[0];
        }

        // Local tests run as a single work-group so all threads share the
        // same __local destination
        threadSize = groupSize = workSize;
    }


    log_info( "\t(thread count %d, group size %d)\n", (int)threadSize, (int)groupSize );

    // NOTE(review): refValues (and later startRefValues/destItems) leak on
    // the early test_error/-1 returns below; only the success path frees them
    refValues = (cl_int *)malloc( typeSize * threadSize );

    if( testFns.GenerateRefsIntFn != NULL )
    {
        // We have a ref generator provided
        d = init_genrand( gRandomSeed );
        startRefValues = malloc( typeSize * threadSize );
        if( typeSize == 4 )
            testFns.GenerateRefsIntFn( threadSize, (cl_int *)startRefValues, d );
        else
            testFns.GenerateRefsLongFn( threadSize, (cl_long *)startRefValues, d );
        free_mtdata(d);
        d = NULL;
    }
    else
        startRefValues = NULL;

    // If we're given a num_results function, we need to determine how many result objects we need. If
    // we don't have it, we assume it's just 1
    size_t numDestItems = ( testFns.NumResultsFn != NULL ) ? testFns.NumResultsFn( threadSize, dataType ) : 1;

    char * destItems = new char[ typeSize * numDestItems ];
    if( destItems == NULL )
    {
        log_error( "ERROR: Unable to allocate memory!\n" );
        return -1;
    }
    // Seed every destination item with the type-appropriate starting value
    void * startValue = ( typeSize == 4 ) ? (void *)&testFns.mIntStartValue : (void *)&testFns.mLongStartValue;
    for( size_t i = 0; i < numDestItems; i++ )
        memcpy( destItems + i * typeSize, startValue, typeSize );

    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                typeSize * numDestItems, destItems, NULL);
    if (!streams[0])
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }
    streams[1] = clCreateBuffer(
        context,
        ((startRefValues != NULL ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
        typeSize * threadSize, startRefValues, NULL);
    if (!streams[1])
    {
        log_error("ERROR: Creating reference array failed!\n");
        return -1;
    }

    /* Set the arguments */
    error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] );
    test_error( error, "Unable to set indexed kernel arguments" );
    error = clSetKernelArg( kernel, 1, sizeof( streams[1] ), &streams[1] );
    test_error( error, "Unable to set indexed kernel arguments" );

    if( isLocal )
    {
        // Arg 2 is the __local scratch destination; size only, no host data
        error = clSetKernelArg( kernel, 2, typeSize * numDestItems, NULL );
        test_error( error, "Unable to set indexed local kernel argument" );

        cl_int numDestItemsInt = (cl_int)numDestItems;
        error = clSetKernelArg( kernel, 3, sizeof( cl_int ), &numDestItemsInt );
        test_error( error, "Unable to set indexed kernel argument" );
    }

    /* Run the kernel */
    threads[0] = threadSize;
    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, &groupSize, 0, NULL, NULL );
    test_error( error, "Unable to execute test kernel" );

    // Blocking reads, so no explicit clFinish is needed before validation
    error = clEnqueueReadBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
    test_error( error, "Unable to read result value!" );

    error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize * threadSize, refValues, 0, NULL, NULL );
    test_error( error, "Unable to read reference values!" );

    // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
    // have one, it's because we're comparing ref values only
    if( testFns.ExpectedValueIntFn != NULL )
    {
        for( size_t i = 0; i < numDestItems; i++ )
        {
            char expected[ 8 ];
            cl_int intVal;
            cl_long longVal;
            if( typeSize == 4 )
            {
                // Int version
                intVal = testFns.ExpectedValueIntFn( threadSize, (cl_int *)startRefValues, i );
                memcpy( expected, &intVal, sizeof( intVal ) );
            }
            else
            {
                // Long version
                longVal = testFns.ExpectedValueLongFn( threadSize, (cl_long *)startRefValues, i );
                memcpy( expected, &longVal, sizeof( longVal ) );
            }

            if( memcmp( expected, destItems + i * typeSize, typeSize ) != 0 )
            {
                // Mismatch: dump the full ref table to aid debugging, then fail
                // (note the dump loops reuse the outer index i)
                if( typeSize == 4 )
                {
                    cl_int *outValue = (cl_int *)( destItems + i * typeSize );
                    log_error( "ERROR: Result %ld from kernel does not validate! (should be %d, was %d)\n", i, intVal, *outValue );
                    cl_int *startRefs = (cl_int *)startRefValues;
                    cl_int *refs = (cl_int *)refValues;
                    for( i = 0; i < threadSize; i++ )
                    {
                        if( startRefs != NULL )
                            log_info( " --- %ld - %d --- %d\n", i, startRefs[i], refs[i] );
                        else
                            log_info( " --- %ld --- %d\n", i, refs[i] );
                    }
                }
                else
                {
                    cl_long *outValue = (cl_long *)( destItems + i * typeSize );
                    log_error( "ERROR: Result %ld from kernel does not validate! (should be %lld, was %lld)\n", i, longVal, *outValue );
                    cl_long *startRefs = (cl_long *)startRefValues;
                    cl_long *refs = (cl_long *)refValues;
                    for( i = 0; i < threadSize; i++ )
                    {
                        if( startRefs != NULL )
                            log_info( " --- %ld - %lld --- %lld\n", i, startRefs[i], refs[i] );
                        else
                            log_info( " --- %ld --- %lld\n", i, refs[i] );
                    }
                }
                return -1;
            }
        }
    }

    if( testFns.VerifyRefsIntFn != NULL )
    {
        /* Use the verify function to also check the results */
        // NOTE(review): "!fn(...) != 0" parses as "(!fn(...)) != 0", i.e. it
        // fires when the verify callback returns false — correct, but the
        // trailing "!= 0" is redundant and misleading
        if( dataType == kFloat )
        {
            cl_float *outValue = (cl_float *)destItems;
            if( !testFns.VerifyRefsFloatFn( threadSize, (cl_float *)refValues, *outValue ) != 0 )
            {
                log_error( "ERROR: Reference values did not validate!\n" );
                return -1;
            }
        }
        else if( typeSize == 4 )
        {
            cl_int *outValue = (cl_int *)destItems;
            if( !testFns.VerifyRefsIntFn( threadSize, (cl_int *)refValues, *outValue ) != 0 )
            {
                log_error( "ERROR: Reference values did not validate!\n" );
                return -1;
            }
        }
        else
        {
            cl_long *outValue = (cl_long *)destItems;
            if( !testFns.VerifyRefsLongFn( threadSize, (cl_long *)refValues, *outValue ) != 0 )
            {
                log_error( "ERROR: Reference values did not validate!\n" );
                return -1;
            }
        }
    }
    else if( testFns.ExpectedValueIntFn == NULL )
    {
        // A test must validate via at least one of the two mechanisms
        log_error( "ERROR: Test doesn't check total or refs; no values are verified!\n" );
        return -1;
    }


    /* Re-write the starting value */
    for( size_t i = 0; i < numDestItems; i++ )
        memcpy( destItems + i * typeSize, startValue, typeSize );
    error = clEnqueueWriteBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
    test_error( error, "Unable to write starting values!" );

    /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
    threads[0] = 1;
    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, threads, 0, NULL, NULL );
    test_error( error, "Unable to execute test kernel" );

    error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize, refValues, 0, NULL, NULL );
    test_error( error, "Unable to read reference values!" );

    // The single thread's returned "old" value must equal the (restored)
    // starting destination value
    if( memcmp( refValues, destItems, typeSize ) != 0 )
    {
        if( typeSize == 4 )
        {
            cl_int *s = (cl_int *)destItems;
            cl_int *r = (cl_int *)refValues;
            log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
                      " (should have been %d, returned %d)!\n", *s, *r );
        }
        else
        {
            cl_long *s = (cl_long *)destItems;
            cl_long *r = (cl_long *)refValues;
            log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
                      " (should have been %lld, returned %lld)!\n", *s, *r );
        }
        return -1;
    }

    delete [] destItems;
    free( refValues );
    if( startRefValues != NULL )
        free( startRefValues );

    return 0;
}
425 
test_atomic_function_set(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,const char * programCore,TestFns testFns,bool extended,bool matchGroupSize,bool usingAtomicPrefix)426 int test_atomic_function_set(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore,
427                              TestFns testFns,
428                              bool extended, bool matchGroupSize, bool usingAtomicPrefix )
429 {
430     log_info("    Testing %s functions...\n", usingAtomicPrefix ? "atomic_" : "atom_");
431 
432     int errors = 0;
433     errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kInt, matchGroupSize );
434     errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kUInt, matchGroupSize );
435     errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kInt, matchGroupSize );
436     errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kUInt, matchGroupSize );
437 
438     // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64 bit functions still use the "atom" prefix.
439     // The argument usingAtomicPrefix is set to true if programCore was generated with the "atomic" prefix.
440     if (!usingAtomicPrefix) {
441       errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kLong, matchGroupSize );
442       errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kULong, matchGroupSize );
443       errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kLong, matchGroupSize );
444       errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kULong, matchGroupSize );
445     }
446 
447     return errors;
448 }
449 
450 #pragma mark ---- add
451 
// Kernel cores for the add test: each thread adds (tid + 3) four times,
// capturing the returned "old" value from the first call only. The atom_
// and atomic_ spellings cover the 1.0-extension and 1.1-core names.
const char atom_add_core[] =
"    oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n"
"    atom_add( &destMemory[0], tid + 3 );\n"
"   atom_add( &destMemory[0], tid + 3 );\n"
"   atom_add( &destMemory[0], tid + 3 );\n";

const char atomic_add_core[] =
"    oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n"
"    atomic_add( &destMemory[0], tid + 3 );\n"
"   atomic_add( &destMemory[0], tid + 3 );\n"
"   atomic_add( &destMemory[0], tid + 3 );\n";
463 
test_atomic_add_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)464 cl_int test_atomic_add_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
465 {
466     cl_int total = 0;
467     for( size_t i = 0; i < size; i++ )
468         total += ( (cl_int)i + 3 ) * 4;
469     return total;
470 }
471 
test_atomic_add_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)472 cl_long test_atomic_add_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
473 {
474     cl_long total = 0;
475     for( size_t i = 0; i < size; i++ )
476         total += ( ( i + 3 ) * 4 );
477     return total;
478 }
479 
test_atomic_add(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)480 int test_atomic_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
481 {
482     TestFns set = { 0, 0LL, NULL, test_atomic_add_result_int, NULL, NULL, test_atomic_add_result_long, NULL, NULL };
483 
484     if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
485         return -1;
486     if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
487       return -1;
488     return 0;
489 }
490 
491 #pragma mark ---- sub
492 
// Kernel cores for the sub test: each thread subtracts (tid + 3) once and
// records the returned "old" value.
const char atom_sub_core[] = "    oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n";

const char atomic_sub_core[] = "    oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n";
496 
test_atomic_sub_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)497 cl_int test_atomic_sub_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
498 {
499     cl_int total = INT_TEST_VALUE;
500     for( size_t i = 0; i < size; i++ )
501         total -= (cl_int)i + 3;
502     return total;
503 }
504 
test_atomic_sub_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)505 cl_long test_atomic_sub_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
506 {
507     cl_long total = LONG_TEST_VALUE;
508     for( size_t i = 0; i < size; i++ )
509         total -= i + 3;
510     return total;
511 }
512 
test_atomic_sub(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)513 int test_atomic_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
514 {
515     TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_sub_result_int, NULL, NULL, test_atomic_sub_result_long, NULL, NULL };
516 
517     if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
518         return -1;
519     if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
520         return -1;
521     return 0;
522 }
523 
524 #pragma mark ---- xchg
525 
// Kernel cores for the xchg test: each thread exchanges its tid into the
// destination and records the value it displaced. The float core is
// textually identical to the int one because the data type is substituted
// into the surrounding kernel pattern, not the core.
const char atom_xchg_core[] = "    oldValues[tid] = atom_xchg( &destMemory[0], tid );\n";

const char atomic_xchg_core[] = "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
const char atomic_xchg_float_core[] = "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
530 
test_atomic_xchg_verify_int(size_t size,cl_int * refValues,cl_int finalValue)531 bool test_atomic_xchg_verify_int( size_t size, cl_int *refValues, cl_int finalValue )
532 {
533     /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
534     char *valids;
535     size_t i;
536     char originalValidCount = 0;
537 
538     valids = (char *)malloc( sizeof( char ) * size );
539     memset( valids, 0, sizeof( char ) * size );
540 
541     for( i = 0; i < size; i++ )
542     {
543         if( refValues[ i ] == INT_TEST_VALUE )
544         {
545             // Special initial value
546             originalValidCount++;
547             continue;
548         }
549         if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
550         {
551             log_error( "ERROR: Reference value %ld outside of valid range! (%d)\n", i, refValues[ i ] );
552             return false;
553         }
554         valids[ refValues[ i ] ] ++;
555     }
556 
557     /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
558      the final value outputted */
559     if( valids[ finalValue ] > 0 )
560     {
561         log_error( "ERROR: Final value %d was also in ref list!\n", finalValue );
562         return false;
563     }
564     else
565         valids[ finalValue ] = 1;    // So the following loop will be okay
566 
567     /* Now check that every entry has one and only one count */
568     if( originalValidCount != 1 )
569     {
570         log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
571         return false;
572     }
573     for( i = 0; i < size; i++ )
574     {
575         if( valids[ i ] != 1 )
576         {
577             log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
578             for( size_t j = 0; j < size; j++ )
579                 log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
580             return false;
581         }
582     }
583 
584     free( valids );
585     return true;
586 }
587 
test_atomic_xchg_verify_long(size_t size,cl_long * refValues,cl_long finalValue)588 bool test_atomic_xchg_verify_long( size_t size, cl_long *refValues, cl_long finalValue )
589 {
590     /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
591     char *valids;
592     size_t i;
593     char originalValidCount = 0;
594 
595     valids = (char *)malloc( sizeof( char ) * size );
596     memset( valids, 0, sizeof( char ) * size );
597 
598     for( i = 0; i < size; i++ )
599     {
600         if( refValues[ i ] == LONG_TEST_VALUE )
601         {
602             // Special initial value
603             originalValidCount++;
604             continue;
605         }
606         if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
607         {
608             log_error( "ERROR: Reference value %ld outside of valid range! (%lld)\n", i, refValues[ i ] );
609             return false;
610         }
611         valids[ refValues[ i ] ] ++;
612     }
613 
614     /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
615      the final value outputted */
616     if( valids[ finalValue ] > 0 )
617     {
618         log_error( "ERROR: Final value %lld was also in ref list!\n", finalValue );
619         return false;
620     }
621     else
622         valids[ finalValue ] = 1;    // So the following loop will be okay
623 
624     /* Now check that every entry has one and only one count */
625     if( originalValidCount != 1 )
626     {
627         log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
628         return false;
629     }
630     for( i = 0; i < size; i++ )
631     {
632         if( valids[ i ] != 1 )
633         {
634             log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
635             for( size_t j = 0; j < size; j++ )
636                 log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
637             return false;
638         }
639     }
640 
641     free( valids );
642     return true;
643 }
644 
test_atomic_xchg_verify_float(size_t size,cl_float * refValues,cl_float finalValue)645 bool test_atomic_xchg_verify_float( size_t size, cl_float *refValues, cl_float finalValue )
646 {
647     /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
648     char *valids;
649     size_t i;
650     char originalValidCount = 0;
651 
652     valids = (char *)malloc( sizeof( char ) * size );
653     memset( valids, 0, sizeof( char ) * size );
654 
655     for( i = 0; i < size; i++ )
656     {
657         cl_int *intRefValue = (cl_int *)( &refValues[ i ] );
658         if( *intRefValue == INT_TEST_VALUE )
659         {
660             // Special initial value
661             originalValidCount++;
662             continue;
663         }
664         if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
665         {
666             log_error( "ERROR: Reference value %ld outside of valid range! (%a)\n", i, refValues[ i ] );
667             return false;
668         }
669         valids[ (int)refValues[ i ] ] ++;
670     }
671 
672     /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
673      the final value outputted */
674     if( valids[ (int)finalValue ] > 0 )
675     {
676         log_error( "ERROR: Final value %a was also in ref list!\n", finalValue );
677         return false;
678     }
679     else
680         valids[ (int)finalValue ] = 1;    // So the following loop will be okay
681 
682     /* Now check that every entry has one and only one count */
683     if( originalValidCount != 1 )
684     {
685         log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
686         return false;
687     }
688     for( i = 0; i < size; i++ )
689     {
690         if( valids[ i ] != 1 )
691         {
692             log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
693             for( size_t j = 0; j < size; j++ )
694                 log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
695             return false;
696         }
697     }
698 
699     free( valids );
700     return true;
701 }
702 
int test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Exercise the exchange atomics for both built-in spellings (pre-1.1
    // atom_ prefix and 1.1 atomic_ prefix), then the float overload of
    // atomic_xchg against both destination-memory variants.
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, NULL, NULL, test_atomic_xchg_verify_int, NULL, NULL, test_atomic_xchg_verify_long, NULL, NULL, test_atomic_xchg_verify_float };

    int failures = 0;
    failures |= test_atomic_function_set( deviceID, context, queue, num_elements, atom_xchg_core, set, false, true, /*usingAtomicPrefix*/ false  );
    failures |= test_atomic_function_set( deviceID, context, queue, num_elements, atomic_xchg_core, set, false, true, /*usingAtomicPrefix*/ true  );

    // Float exchange: run once with each setting of the second flag
    // (presumably the local-memory variant — both settings are covered).
    failures |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, false, kFloat, true );
    failures |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, true, kFloat, true );

    return failures;
}
715 
716 
717 #pragma mark ---- min
718 
// Kernel cores for the "min" test: each thread folds its reference value into
// destMemory[0] with an atomic min and records the value it observed there.
// Two spellings: pre-OpenCL-1.1 atom_ prefix and the 1.1 atomic_ prefix.
const char atom_min_core[] = "    oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n";

const char atomic_min_core[] = "    oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n";
722 
cl_int test_atomic_min_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // Expected final destination: the smallest reference value, starting
    // from the same 0x7fffffff seed the test writes into the destination.
    cl_int smallest = 0x7fffffffL;
    size_t idx = 0;
    while( idx < size )
    {
        if( startRefValues[ idx ] < smallest )
            smallest = startRefValues[ idx ];
        idx++;
    }
    return smallest;
}
733 
void test_atomic_min_gen_int( size_t size, cl_int *startRefValues, MTdata d )
{
    // Random reference values in [0x3fffffff, 0x7ffffffd]: large positive
    // ints, comfortably below the 0x7fffffff starting value.
    size_t idx;
    for( idx = 0; idx < size; idx++ )
    {
        cl_int draw = (cl_int)( genrand_int32(d) % 0x3fffffff );
        startRefValues[idx] = draw + 0x3fffffff;
    }
}
739 
cl_long test_atomic_min_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // Expected final destination: the smallest reference value, starting
    // from the same 0x7fffffffffffffff seed used for the destination.
    cl_long smallest = 0x7fffffffffffffffLL;
    size_t idx = 0;
    while( idx < size )
    {
        if( startRefValues[ idx ] < smallest )
            smallest = startRefValues[ idx ];
        idx++;
    }
    return smallest;
}
750 
void test_atomic_min_gen_long( size_t size, cl_long *startRefValues, MTdata d )
{
    // Each reference value mixes two 32-bit random draws into one long:
    // one full 32-bit word OR-ed with a 31-bit value shifted up 16 bits.
    // (Kept as one expression: the | operands' evaluation order is
    // unspecified, and splitting the calls could reorder the RNG stream.)
    cl_long *cursor = startRefValues;
    cl_long *end = startRefValues + size;
    while( cursor != end )
        *cursor++ = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
}
756 
int test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Run the min tests for both built-in spellings; a failure in either
    // aborts immediately with -1.
    TestFns set = { 0x7fffffffL, 0x7fffffffffffffffLL, NULL, test_atomic_min_result_int, test_atomic_min_gen_int, NULL, test_atomic_min_result_long, test_atomic_min_gen_long, NULL };

    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_min_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0
        || test_atomic_function_set( deviceID, context, queue, num_elements, atomic_min_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
        return -1;
    return 0;
}
767 
768 
769 #pragma mark ---- max
770 
// Kernel cores for the "max" test: each thread folds its reference value into
// destMemory[0] with an atomic max and records the value it observed there.
const char atom_max_core[] = "    oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n";

const char atomic_max_core[] = "    oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n";
774 
cl_int test_atomic_max_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // Expected final destination: the largest reference value, floored at 0
    // (matching the 0 starting value the max test seeds into TestFns).
    cl_int largest = 0;
    size_t idx = 0;
    while( idx < size )
    {
        if( startRefValues[ idx ] > largest )
            largest = startRefValues[ idx ];
        idx++;
    }
    return largest;
}
785 
void test_atomic_max_gen_int( size_t size, cl_int *startRefValues, MTdata d )
{
    // Random reference values in [0x3fffffff, 0x7ffffffd]: large positive
    // ints, all strictly above the 0 starting value.
    size_t idx;
    for( idx = 0; idx < size; idx++ )
    {
        cl_int draw = (cl_int)( genrand_int32(d) % 0x3fffffff );
        startRefValues[idx] = draw + 0x3fffffff;
    }
}
791 
cl_long test_atomic_max_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // Expected final destination: the largest reference value, floored at 0
    // (matching the 0 starting value in the max TestFns).
    cl_long largest = 0;
    size_t idx = 0;
    while( idx < size )
    {
        if( startRefValues[ idx ] > largest )
            largest = startRefValues[ idx ];
        idx++;
    }
    return largest;
}
802 
void test_atomic_max_gen_long( size_t size, cl_long *startRefValues, MTdata d )
{
    // Each reference value mixes two 32-bit random draws into one long
    // (single expression on purpose: | operand order is unspecified, so
    // splitting the two RNG calls could change the generated sequence).
    cl_long *cursor = startRefValues;
    cl_long *end = startRefValues + size;
    while( cursor != end )
        *cursor++ = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
}
808 
int test_atomic_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Run the max tests for both built-in spellings; a failure in either
    // aborts immediately with -1.
    TestFns set = { 0, 0, NULL, test_atomic_max_result_int, test_atomic_max_gen_int, NULL, test_atomic_max_result_long, test_atomic_max_gen_long, NULL };

    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0
        || test_atomic_function_set( deviceID, context, queue, num_elements, atomic_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
        return -1;
    return 0;
}
819 
820 
821 #pragma mark ---- inc
822 
// Kernel cores for the "inc" test: each thread atomically increments the
// shared destination once and records the pre-increment value it saw.
const char atom_inc_core[] = "    oldValues[tid] = atom_inc( &destMemory[0] );\n";

const char atomic_inc_core[] = "    oldValues[tid] = atomic_inc( &destMemory[0] );\n";
826 
cl_int test_atomic_inc_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // Starting value plus exactly one increment per thread.
    cl_int increments = (cl_int)size;
    return INT_TEST_VALUE + increments;
}
831 
cl_long test_atomic_inc_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // Starting value plus exactly one increment per thread.
    cl_long increments = (cl_long)size;
    return LONG_TEST_VALUE + increments;
}
836 
int test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Run the inc tests for both built-in spellings; a failure in either
    // aborts immediately with -1.
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_inc_result_int, NULL, NULL, test_atomic_inc_result_long, NULL, NULL };

    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0
        || test_atomic_function_set( deviceID, context, queue, num_elements, atomic_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
        return -1;
    return 0;
}
847 
848 
849 #pragma mark ---- dec
850 
// Kernel cores for the "dec" test: each thread atomically decrements the
// shared destination once and records the pre-decrement value it saw.
const char atom_dec_core[] = "    oldValues[tid] = atom_dec( &destMemory[0] );\n";

const char atomic_dec_core[] = "    oldValues[tid] = atomic_dec( &destMemory[0] );\n";
854 
cl_int test_atomic_dec_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // Starting value minus exactly one decrement per thread.
    cl_int decrements = (cl_int)size;
    return INT_TEST_VALUE - decrements;
}
859 
cl_long test_atomic_dec_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // Starting value minus exactly one decrement per thread.
    cl_long decrements = (cl_long)size;
    return LONG_TEST_VALUE - decrements;
}
864 
int test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Run the dec tests for both built-in spellings; a failure in either
    // aborts immediately with -1.
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_dec_result_int, NULL, NULL, test_atomic_dec_result_long, NULL, NULL };

    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0
        || test_atomic_function_set( deviceID, context, queue, num_elements, atomic_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
        return -1;
    return 0;
}
875 
876 
877 #pragma mark ---- cmpxchg
878 
/* We test cmpxchg by implementing (the long way) atom_add: each thread spins
   on compare-and-swap until it successfully adds (tid + 2) to the shared
   destination, then records the value it swapped out. */
const char atom_cmpxchg_core[] =
"    int oldValue, origValue, newValue;\n"
"    do { \n"
"        origValue = destMemory[0];\n"
"        newValue = origValue + tid + 2;\n"
"        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
"    } while( oldValue != origValue );\n"
"    oldValues[tid] = oldValue;\n"
;

// 64-bit variant of the same CAS-based add loop (atom_ prefix only).
const char atom_cmpxchg64_core[] =
"    long oldValue, origValue, newValue;\n"
"    do { \n"
"        origValue = destMemory[0];\n"
"        newValue = origValue + tid + 2;\n"
"        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
"    } while( oldValue != origValue );\n"
"    oldValues[tid] = oldValue;\n"
;

// Same 32-bit CAS-based add loop, spelled with the OpenCL 1.1 atomic_ prefix.
const char atomic_cmpxchg_core[] =
"    int oldValue, origValue, newValue;\n"
"    do { \n"
"        origValue = destMemory[0];\n"
"        newValue = origValue + tid + 2;\n"
"        oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n"
"    } while( oldValue != origValue );\n"
"    oldValues[tid] = oldValue;\n"
;
909 
cl_int test_atomic_cmpxchg_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // The CAS kernel adds (tid + 2) per thread, so the expected final value
    // is the start value plus 2 + 3 + ... + (size + 1).
    cl_int expected = INT_TEST_VALUE;
    cl_int addend = 2;
    size_t remaining = size;
    while( remaining-- )
        expected += addend++;
    return expected;
}
917 
cl_long test_atomic_cmpxchg_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // The CAS kernel adds (tid + 2) per thread, so the expected final value
    // is the start value plus 2 + 3 + ... + (size + 1).
    cl_long expected = LONG_TEST_VALUE;
    cl_long addend = 2;
    size_t remaining = size;
    while( remaining-- )
        expected += addend++;
    return expected;
}
925 
int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // cmpxchg is validated by using it to build an atomic add (see the CAS
    // kernel cores above), so the expected results are plain sums.
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_cmpxchg_result_int, NULL, NULL, test_atomic_cmpxchg_result_long, NULL, NULL };

    int failed = 0;

    // Each core runs for signed and unsigned, with the second flag both off
    // and on (presumably the local-memory variant — both are exercised).
    log_info("    Testing atom_ functions...\n");
    for( int pass = 0; pass <= 1; pass++ )
    {
        failed |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, pass != 0, kInt, true );
        failed |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, pass != 0, kUInt, true );
    }
    for( int pass = 0; pass <= 1; pass++ )
    {
        failed |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, pass != 0, kLong, true );
        failed |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, pass != 0, kULong, true );
    }

    log_info("    Testing atomic_ functions...\n");
    for( int pass = 0; pass <= 1; pass++ )
    {
        failed |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, pass != 0, kInt, true );
        failed |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, pass != 0, kUInt, true );
    }

    return failed ? -1 : 0;
}
954 
955 #pragma mark -------- Bitwise functions
956 
size_t test_bitwise_num_results( size_t threadCount, ExplicitType dataType )
{
    // Bitwise tests use one destination word per (bits-in-type) threads,
    // rounded up so leftover threads still get a word.
    size_t bitsPerItem = 8 * get_explicit_type_size( dataType );
    size_t fullItems = threadCount / bitsPerItem;
    return ( threadCount % bitsPerItem ) ? fullItems + 1 : fullItems;
}
963 
964 #pragma mark ---- and
965 
// Kernel cores for the "and" test: thread tid clears one bit (bitIndex) in
// one destination word (whichResult), recording the pre-AND value.
const char atom_and_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  whichResult = tid / numBits;\n"
"    int  bitIndex = tid - ( whichResult * numBits );\n"
"\n"
"    oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
;

// Same per-bit clear, spelled with the OpenCL 1.1 atomic_ prefix.
const char atomic_and_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  whichResult = tid / numBits;\n"
"    int  bitIndex = tid - ( whichResult * numBits );\n"
"\n"
"    oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
;
981 
982 
cl_int test_atomic_and_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
    // One bit is AND-ed away per thread, so every fully-covered 32-bit
    // destination word ends up 0.
    size_t numThreads = ( size + 31 ) / 32;
    if( whichResult < numThreads - 1 )
        return 0;

    // The last word only loses as many low bits as there are leftover
    // threads; the remaining high bits stay set.
    size_t bitsCleared = size - whichResult * 32;
    cl_int clearedMask = 0;
    for( size_t i = 0; i < bitsCleared; i++ )
        clearedMask |= (cl_int)( 1 << i );

    return ~clearedMask;
}
997 
cl_long test_atomic_and_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
    // One bit is AND-ed away per thread, so every fully-covered 64-bit
    // destination word ends up 0.
    size_t numThreads = ( size + 63 ) / 64;
    if( whichResult < numThreads - 1 )
        return 0;

    // The last word only loses as many low bits as there are leftover
    // threads; the remaining high bits stay set.
    size_t bitsCleared = size - whichResult * 64;
    cl_long clearedMask = 0;
    for( size_t i = 0; i < bitsCleared; i++ )
        clearedMask |= ( 1LL << i );

    return ~clearedMask;
}
1011 
int test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Run the and tests for both built-in spellings; a failure in either
    // aborts immediately with -1. Destinations start with all bits set.
    TestFns set = { 0xffffffff, 0xffffffffffffffffLL, test_bitwise_num_results,
        test_atomic_and_result_int, NULL, NULL, test_atomic_and_result_long, NULL, NULL };

    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0
        || test_atomic_function_set( deviceID, context, queue, num_elements, atomic_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
        return -1;
    return 0;
}
1023 
1024 
1025 #pragma mark ---- or
1026 
// Kernel cores for the "or" test: thread tid sets one bit (bitIndex) in one
// destination word (whichResult), recording the pre-OR value.
const char atom_or_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  whichResult = tid / numBits;\n"
"    int  bitIndex = tid - ( whichResult * numBits );\n"
"\n"
"    oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
;

// Same per-bit set, spelled with the OpenCL 1.1 atomic_ prefix.
const char atomic_or_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  whichResult = tid / numBits;\n"
"    int  bitIndex = tid - ( whichResult * numBits );\n"
"\n"
"    oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
;
1042 
cl_int test_atomic_or_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
    // One bit is OR-ed in per thread, so every fully-covered 32-bit
    // destination word ends up all ones.
    size_t numThreads = ( size + 31 ) / 32;
    if( whichResult < numThreads - 1 )
        return 0xffffffff;

    // The last word only receives as many low bits as there are leftover
    // threads.
    size_t bitsSet = size - whichResult * 32;
    cl_int mask = 0;
    size_t bit = 0;
    while( bit < bitsSet )
        mask |= ( 1 << bit++ );

    return mask;
}
1057 
cl_long test_atomic_or_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
    // One bit is OR-ed in per thread, so every fully-covered 64-bit
    // destination word ends up all ones.
    size_t numThreads = ( size + 63 ) / 64;
    if( whichResult < numThreads - 1 )
        return 0x0ffffffffffffffffLL;

    // The last word only receives as many low bits as there are leftover
    // threads.
    size_t bitsSet = size - whichResult * 64;
    cl_long mask = 0;
    size_t bit = 0;
    while( bit < bitsSet )
        mask |= ( 1LL << bit++ );

    return mask;
}
1072 
int test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Run the or tests for both built-in spellings; a failure in either
    // aborts immediately with -1. Destinations start zeroed.
    TestFns set = { 0, 0LL, test_bitwise_num_results, test_atomic_or_result_int, NULL, NULL, test_atomic_or_result_long, NULL, NULL };

    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0
        || test_atomic_function_set( deviceID, context, queue, num_elements, atomic_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
        return -1;
    return 0;
}
1083 
1084 
1085 #pragma mark ---- xor
1086 
// Kernel cores for the "xor" test: every thread XORs a single bit (its tid
// modulo the type width) into the one shared destination word.
const char atom_xor_core[] =
    "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    "    int  bitIndex = tid & ( numBits - 1 );\n"
    "\n"
    "    oldValues[tid] = atom_xor( &destMemory[0], 1L << bitIndex );\n";

// Same per-bit toggle, spelled with the OpenCL 1.1 atomic_ prefix.
const char atomic_xor_core[] =
    "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    "    int  bitIndex = tid & ( numBits - 1 );\n"
    "\n"
    "    oldValues[tid] = atomic_xor( &destMemory[0], 1L << bitIndex );\n";
1098 
cl_int test_atomic_xor_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
    // Start value XOR one bit per thread; the bit index wraps every 32
    // threads, matching the kernel's (tid & 31).
    cl_int expected = 0x2f08ab41;
    for( size_t i = 0; i < size; i++ )
        expected ^= 1 << ( i % 32 );
    return expected;
}
1106 
cl_long test_atomic_xor_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
    // Start value XOR one bit per thread; the bit index wraps every 64
    // threads, matching the kernel's (tid & 63).
    cl_long expected = 0x2f08ab418ba0541LL;
    for( size_t i = 0; i < size; i++ )
        expected ^= 1LL << ( i % 64 );
    return expected;
}
1114 
int test_atomic_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Run the xor tests for both built-in spellings; a failure in either
    // aborts immediately with -1. Start values match the expected-result
    // helpers' seeds.
    TestFns set = { 0x2f08ab41, 0x2f08ab418ba0541LL, NULL, test_atomic_xor_result_int, NULL, NULL, test_atomic_xor_result_long, NULL, NULL };

    if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0
        || test_atomic_function_set( deviceID, context, queue, num_elements, atomic_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
        return -1;
    return 0;
}
1125 
1126 
1127 
1128 
1129