1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <cstring>
22 
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)23 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
24                        cl_kernel *k, cl_program *p, bool relaxedMode)
25 {
26     const char *c[] = { "__kernel void math_kernel",
27                         sizeNames[vectorSize],
28                         "( __global int",
29                         sizeNames[vectorSize],
30                         "* out, __global float",
31                         sizeNames[vectorSize],
32                         "* in1, __global float",
33                         sizeNames[vectorSize],
34                         "* in2 )\n"
35                         "{\n"
36                         "   size_t i = get_global_id(0);\n"
37                         "   out[i] = ",
38                         name,
39                         "( in1[i], in2[i] );\n"
40                         "}\n" };
41 
42     const char *c3[] = {
43         "__kernel void math_kernel",
44         sizeNames[vectorSize],
45         "( __global int* out, __global float* in, __global float* in2)\n"
46         "{\n"
47         "   size_t i = get_global_id(0);\n"
48         "   if( i + 1 < get_global_size(0) )\n"
49         "   {\n"
50         "       float3 f0 = vload3( 0, in + 3 * i );\n"
51         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
52         "       int3 i0 = ",
53         name,
54         "( f0, f1 );\n"
55         "       vstore3( i0, 0, out + 3*i );\n"
56         "   }\n"
57         "   else\n"
58         "   {\n"
59         "       size_t parity = i & 1;   // Figure out how many elements are "
60         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
61         "buffer size \n"
62         "       float3 f0;\n"
63         "       float3 f1;\n"
64         "       switch( parity )\n"
65         "       {\n"
66         "           case 1:\n"
67         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
68         "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
69         "               break;\n"
70         "           case 0:\n"
71         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
72         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
73         "               break;\n"
74         "       }\n"
75         "       int3 i0 = ",
76         name,
77         "( f0, f1 );\n"
78         "       switch( parity )\n"
79         "       {\n"
80         "           case 0:\n"
81         "               out[3*i+1] = i0.y; \n"
82         "               // fall through\n"
83         "           case 1:\n"
84         "               out[3*i] = i0.x; \n"
85         "               break;\n"
86         "       }\n"
87         "   }\n"
88         "}\n"
89     };
90 
91     const char **kern = c;
92     size_t kernSize = sizeof(c) / sizeof(c[0]);
93 
94     if (sizeValues[vectorSize] == 3)
95     {
96         kern = c3;
97         kernSize = sizeof(c3) / sizeof(c3[0]);
98     }
99 
100     char testName[32];
101     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
102              sizeNames[vectorSize]);
103 
104     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
105                        relaxedMode);
106 }
107 
108 typedef struct BuildKernelInfo
109 {
110     cl_uint offset; // the first vector size to build
111     cl_uint kernel_count;
112     cl_kernel **kernels;
113     cl_program *programs;
114     const char *nameInCode;
115     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
116 } BuildKernelInfo;
117 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)118 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
119 {
120     BuildKernelInfo *info = (BuildKernelInfo *)p;
121     cl_uint i = info->offset + job_id;
122     return BuildKernel(info->nameInCode, i, info->kernel_count,
123                        info->kernels[i], info->programs + i, info->relaxedMode);
124 }
125 
126 // Thread specific data for a worker thread
127 typedef struct ThreadInfo
128 {
129     cl_mem inBuf; // input buffer for the thread
130     cl_mem inBuf2; // input buffer for the thread
131     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
132     MTdata d;
133     cl_command_queue tQueue; // per thread command queue to improve performance
134 } ThreadInfo;
135 
136 typedef struct TestInfo
137 {
138     size_t subBufferSize; // Size of the sub-buffer in elements
139     const Func *f; // A pointer to the function info
140     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
141     cl_kernel
142         *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
143                                // worker thread:  k[vector_size][thread_id]
144     ThreadInfo *
145         tinfo; // An array of thread specific information for each worker thread
146     cl_uint threadCount; // Number of worker threads
147     cl_uint jobCount; // Number of jobs
148     cl_uint step; // step between each chunk and the next.
149     cl_uint scale; // stride between individual test values
150     int ftz; // non-zero if running in flush to zero mode
151 
152 } TestInfo;
153 
154 // A table of more difficult cases to get right
155 static const float specialValues[] = {
156     -NAN,
157     -INFINITY,
158     -FLT_MAX,
159     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
160     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
161     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
162     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
163     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
164     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
165     MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
166     MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
167     MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
168     MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
169     MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
170     MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
171     -1000.f,
172     -100.f,
173     -4.0f,
174     -3.5f,
175     -3.0f,
176     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
177     -2.5f,
178     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
179     -2.0f,
180     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
181     -1.5f,
182     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
183     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
184     -1.0f,
185     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
186     MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
187     -0.5f,
188     MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
189     MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
190     -0.25f,
191     MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
192     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
193     -FLT_MIN,
194     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
195     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
196     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
197     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
198     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
199     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
200     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
201     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
202     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
203     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
204     -0.0f,
205 
206     +NAN,
207     +INFINITY,
208     +FLT_MAX,
209     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
210     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
211     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
212     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
213     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
214     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
215     MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
216     MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
217     MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
218     MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
219     MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
220     MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
221     +1000.f,
222     +100.f,
223     +4.0f,
224     +3.5f,
225     +3.0f,
226     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
227     2.5f,
228     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
229     +2.0f,
230     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
231     1.5f,
232     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
233     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
234     +1.0f,
235     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
236     MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
237     +0.5f,
238     MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
239     MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
240     +0.25f,
241     MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
242     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
243     +FLT_MIN,
244     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
245     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
246     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
247     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
248     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
249     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
250     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
251     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
252     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
253     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
254     +0.0f,
255 };
256 
257 static const size_t specialValuesCount =
258     sizeof(specialValues) / sizeof(specialValues[0]);
259 
260 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
261 
TestMacro_Int_Float_Float(const Func * f,MTdata d,bool relaxedMode)262 int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
263 {
264     TestInfo test_info;
265     cl_int error;
266 
267     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
268 
269     // Init test_info
270     memset(&test_info, 0, sizeof(test_info));
271     test_info.threadCount = GetThreadCount();
272     test_info.subBufferSize = BUFFER_SIZE
273         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
274     test_info.scale = getTestScale(sizeof(cl_float));
275 
276     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
277     if (test_info.step / test_info.subBufferSize != test_info.scale)
278     {
279         // there was overflow
280         test_info.jobCount = 1;
281     }
282     else
283     {
284         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
285     }
286 
287     test_info.f = f;
288     test_info.ftz =
289         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
290 
291     // cl_kernels aren't thread safe, so we make one for each vector size for
292     // every thread
293     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
294     {
295         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
296         test_info.k[i] = (cl_kernel *)malloc(array_size);
297         if (NULL == test_info.k[i])
298         {
299             vlog_error("Error: Unable to allocate storage for kernels!\n");
300             error = CL_OUT_OF_HOST_MEMORY;
301             goto exit;
302         }
303         memset(test_info.k[i], 0, array_size);
304     }
305     test_info.tinfo =
306         (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
307     if (NULL == test_info.tinfo)
308     {
309         vlog_error(
310             "Error: Unable to allocate storage for thread specific data.\n");
311         error = CL_OUT_OF_HOST_MEMORY;
312         goto exit;
313     }
314     memset(test_info.tinfo, 0,
315            test_info.threadCount * sizeof(*test_info.tinfo));
316     for (cl_uint i = 0; i < test_info.threadCount; i++)
317     {
318         cl_buffer_region region = {
319             i * test_info.subBufferSize * sizeof(cl_float),
320             test_info.subBufferSize * sizeof(cl_float)
321         };
322         test_info.tinfo[i].inBuf =
323             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
324                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
325         if (error || NULL == test_info.tinfo[i].inBuf)
326         {
327             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
328                        "region {%zd, %zd}\n",
329                        region.origin, region.size);
330             goto exit;
331         }
332         test_info.tinfo[i].inBuf2 =
333             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
334                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
335         if (error || NULL == test_info.tinfo[i].inBuf2)
336         {
337             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
338                        "region {%zd, %zd}\n",
339                        region.origin, region.size);
340             goto exit;
341         }
342 
343         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
344         {
345             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
346                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
347                 &region, &error);
348             if (error || NULL == test_info.tinfo[i].outBuf[j])
349             {
350                 vlog_error("Error: Unable to create sub-buffer of "
351                            "gOutBuffer[%d] for region {%zd, %zd}\n",
352                            (int)j, region.origin, region.size);
353                 goto exit;
354             }
355         }
356         test_info.tinfo[i].tQueue =
357             clCreateCommandQueue(gContext, gDevice, 0, &error);
358         if (NULL == test_info.tinfo[i].tQueue || error)
359         {
360             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
361             goto exit;
362         }
363 
364         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
365     }
366 
367     // Init the kernels
368     {
369         BuildKernelInfo build_info = {
370             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
371             test_info.programs,  f->nameInCode,         relaxedMode
372         };
373         if ((error = ThreadPool_Do(BuildKernelFn,
374                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
375                                    &build_info)))
376             goto exit;
377     }
378 
379     // Run the kernels
380     if (!gSkipCorrectnessTesting)
381     {
382         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
383 
384         if (error) goto exit;
385 
386         if (gWimpyMode)
387             vlog("Wimp pass");
388         else
389             vlog("passed");
390     }
391 
392     vlog("\n");
393 
394 exit:
395     // Release
396     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
397     {
398         clReleaseProgram(test_info.programs[i]);
399         if (test_info.k[i])
400         {
401             for (cl_uint j = 0; j < test_info.threadCount; j++)
402                 clReleaseKernel(test_info.k[i][j]);
403 
404             free(test_info.k[i]);
405         }
406     }
407     if (test_info.tinfo)
408     {
409         for (cl_uint i = 0; i < test_info.threadCount; i++)
410         {
411             free_mtdata(test_info.tinfo[i].d);
412             clReleaseMemObject(test_info.tinfo[i].inBuf);
413             clReleaseMemObject(test_info.tinfo[i].inBuf2);
414             for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
415                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
416             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
417         }
418 
419         free(test_info.tinfo);
420     }
421 
422     return error;
423 }
424 
Test(cl_uint job_id,cl_uint thread_id,void * data)425 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
426 {
427     const TestInfo *job = (const TestInfo *)data;
428     size_t buffer_elements = job->subBufferSize;
429     size_t buffer_size = buffer_elements * sizeof(cl_float);
430     cl_uint base = job_id * (cl_uint)job->step;
431     ThreadInfo *tinfo = job->tinfo + thread_id;
432     fptr func = job->f->func;
433     int ftz = job->ftz;
434     MTdata d = tinfo->d;
435     cl_int error;
436     const char *name = job->f->name;
437     cl_int *t = 0;
438     cl_int *r = 0;
439     cl_float *s = 0;
440     cl_float *s2 = 0;
441 
442     // start the map of the output arrays
443     cl_event e[VECTOR_SIZE_COUNT];
444     cl_int *out[VECTOR_SIZE_COUNT];
445     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
446     {
447         out[j] = (cl_int *)clEnqueueMapBuffer(
448             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
449             buffer_size, 0, NULL, e + j, &error);
450         if (error || NULL == out[j])
451         {
452             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
453                        error);
454             return error;
455         }
456     }
457 
458     // Get that moving
459     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
460 
461     // Init input array
462     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
463     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
464     cl_uint idx = 0;
465 
466     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
467     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
468 
469     if (job_id <= (cl_uint)lastSpecialJobIndex)
470     { // test edge cases
471         float *fp = (float *)p;
472         float *fp2 = (float *)p2;
473         uint32_t x, y;
474 
475         x = (job_id * buffer_elements) % specialValuesCount;
476         y = (job_id * buffer_elements) / specialValuesCount;
477 
478         for (; idx < buffer_elements; idx++)
479         {
480             fp[idx] = specialValues[x];
481             fp2[idx] = specialValues[y];
482             ++x;
483             if (x >= specialValuesCount)
484             {
485                 x = 0;
486                 y++;
487                 if (y >= specialValuesCount) break;
488             }
489         }
490     }
491 
492     // Init any remaining values.
493     for (; idx < buffer_elements; idx++)
494     {
495         p[idx] = genrand_int32(d);
496         p2[idx] = genrand_int32(d);
497     }
498 
499     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
500                                       buffer_size, p, 0, NULL, NULL)))
501     {
502         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
503         goto exit;
504     }
505 
506     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
507                                       buffer_size, p2, 0, NULL, NULL)))
508     {
509         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
510         goto exit;
511     }
512 
513     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
514     {
515         // Wait for the map to finish
516         if ((error = clWaitForEvents(1, e + j)))
517         {
518             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
519             goto exit;
520         }
521         if ((error = clReleaseEvent(e[j])))
522         {
523             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
524             goto exit;
525         }
526 
527         // Fill the result buffer with garbage, so that old results don't carry
528         // over
529         uint32_t pattern = 0xffffdead;
530         memset_pattern4(out[j], &pattern, buffer_size);
531         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
532                                              out[j], 0, NULL, NULL)))
533         {
534             vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
535             goto exit;
536         }
537 
538         // run the kernel
539         size_t vectorCount =
540             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
541         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
542                                                  // own copy of the cl_kernel
543         cl_program program = job->programs[j];
544 
545         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
546                                     &tinfo->outBuf[j])))
547         {
548             LogBuildError(program);
549             return error;
550         }
551         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
552                                     &tinfo->inBuf)))
553         {
554             LogBuildError(program);
555             return error;
556         }
557         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
558                                     &tinfo->inBuf2)))
559         {
560             LogBuildError(program);
561             return error;
562         }
563 
564         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
565                                             &vectorCount, NULL, 0, NULL, NULL)))
566         {
567             vlog_error("FAILED -- could not execute kernel\n");
568             goto exit;
569         }
570     }
571 
572     // Get that moving
573     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
574 
575     if (gSkipCorrectnessTesting) return CL_SUCCESS;
576 
577     // Calculate the correctly rounded reference result
578     r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
579     s = (float *)gIn + thread_id * buffer_elements;
580     s2 = (float *)gIn2 + thread_id * buffer_elements;
581     for (size_t j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
582 
583     // Read the data back -- no need to wait for the first N-1 buffers but wait
584     // for the last buffer. This is an in order queue.
585     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
586     {
587         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
588         out[j] = (cl_int *)clEnqueueMapBuffer(
589             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
590             buffer_size, 0, NULL, NULL, &error);
591         if (error || NULL == out[j])
592         {
593             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
594                        error);
595             goto exit;
596         }
597     }
598 
599     // Verify data
600     t = (cl_int *)r;
601     for (size_t j = 0; j < buffer_elements; j++)
602     {
603         cl_int *q = out[0];
604 
605         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
606         {
607             if (ftz)
608             {
609                 if (IsFloatSubnormal(s[j]))
610                 {
611                     if (IsFloatSubnormal(s2[j]))
612                     {
613                         int correct = func.i_ff(0.0f, 0.0f);
614                         int correct2 = func.i_ff(0.0f, -0.0f);
615                         int correct3 = func.i_ff(-0.0f, 0.0f);
616                         int correct4 = func.i_ff(-0.0f, -0.0f);
617 
618                         if (correct == q[j] || correct2 == q[j]
619                             || correct3 == q[j] || correct4 == q[j])
620                             continue;
621                     }
622                     else
623                     {
624                         int correct = func.i_ff(0.0f, s2[j]);
625                         int correct2 = func.i_ff(-0.0f, s2[j]);
626                         if (correct == q[j] || correct2 == q[j]) continue;
627                     }
628                 }
629                 else if (IsFloatSubnormal(s2[j]))
630                 {
631                     int correct = func.i_ff(s[j], 0.0f);
632                     int correct2 = func.i_ff(s[j], -0.0f);
633                     if (correct == q[j] || correct2 == q[j]) continue;
634                 }
635             }
636 
637             uint32_t err = t[j] - q[j];
638             if (q[j] > t[j]) err = q[j] - t[j];
639             vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
640                        "0x%8.8x (index: %d)\n",
641                        name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
642                        j);
643             error = -1;
644             goto exit;
645         }
646 
647         for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
648         {
649             q = out[k];
650             // If we aren't getting the correctly rounded result
651             if (-t[j] != q[j])
652             {
653                 if (ftz)
654                 {
655                     if (IsFloatSubnormal(s[j]))
656                     {
657                         if (IsFloatSubnormal(s2[j]))
658                         {
659                             int correct = -func.i_ff(0.0f, 0.0f);
660                             int correct2 = -func.i_ff(0.0f, -0.0f);
661                             int correct3 = -func.i_ff(-0.0f, 0.0f);
662                             int correct4 = -func.i_ff(-0.0f, -0.0f);
663 
664                             if (correct == q[j] || correct2 == q[j]
665                                 || correct3 == q[j] || correct4 == q[j])
666                                 continue;
667                         }
668                         else
669                         {
670                             int correct = -func.i_ff(0.0f, s2[j]);
671                             int correct2 = -func.i_ff(-0.0f, s2[j]);
672                             if (correct == q[j] || correct2 == q[j]) continue;
673                         }
674                     }
675                     else if (IsFloatSubnormal(s2[j]))
676                     {
677                         int correct = -func.i_ff(s[j], 0.0f);
678                         int correct2 = -func.i_ff(s[j], -0.0f);
679                         if (correct == q[j] || correct2 == q[j]) continue;
680                     }
681                 }
682                 cl_uint err = -t[j] - q[j];
683                 if (q[j] > -t[j]) err = q[j] + t[j];
684                 vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
685                            "vs. 0x%8.8x (index: %d)\n",
686                            name, sizeNames[k], err, ((float *)s)[j],
687                            ((float *)s2)[j], -t[j], q[j], j);
688                 error = -1;
689                 goto exit;
690             }
691         }
692     }
693 
694     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
695     {
696         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
697                                              out[j], 0, NULL, NULL)))
698         {
699             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
700                        j, error);
701             return error;
702         }
703     }
704 
705     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
706 
707 
708     if (0 == (base & 0x0fffffff))
709     {
710         if (gVerboseBruteForce)
711         {
712             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
713                  "ThreadCount:%2u\n",
714                  base, job->step, job->scale, buffer_elements,
715                  job->threadCount);
716         }
717         else
718         {
719             vlog(".");
720         }
721         fflush(stdout);
722     }
723 
724 exit:
725     return error;
726 }
727