1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <climits>
22 #include <cstring>
23 
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)24 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
25                        cl_kernel *k, cl_program *p, bool relaxedMode)
26 {
27     const char *c[] = { "__kernel void math_kernel",
28                         sizeNames[vectorSize],
29                         "( __global float",
30                         sizeNames[vectorSize],
31                         "* out, __global float",
32                         sizeNames[vectorSize],
33                         "* in1, __global int",
34                         sizeNames[vectorSize],
35                         "* in2 )\n"
36                         "{\n"
37                         "   size_t i = get_global_id(0);\n"
38                         "   out[i] = ",
39                         name,
40                         "( in1[i], in2[i] );\n"
41                         "}\n" };
42 
43     const char *c3[] = {
44         "__kernel void math_kernel",
45         sizeNames[vectorSize],
46         "( __global float* out, __global float* in, __global int* in2)\n"
47         "{\n"
48         "   size_t i = get_global_id(0);\n"
49         "   if( i + 1 < get_global_size(0) )\n"
50         "   {\n"
51         "       float3 f0 = vload3( 0, in + 3 * i );\n"
52         "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
53         "       f0 = ",
54         name,
55         "( f0, i0 );\n"
56         "       vstore3( f0, 0, out + 3*i );\n"
57         "   }\n"
58         "   else\n"
59         "   {\n"
60         "       size_t parity = i & 1;   // Figure out how many elements are "
61         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
62         "buffer size \n"
63         "       float3 f0;\n"
64         "       int3 i0;\n"
65         "       switch( parity )\n"
66         "       {\n"
67         "           case 1:\n"
68         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
69         "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
70         "               break;\n"
71         "           case 0:\n"
72         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
73         "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
74         "               break;\n"
75         "       }\n"
76         "       f0 = ",
77         name,
78         "( f0, i0 );\n"
79         "       switch( parity )\n"
80         "       {\n"
81         "           case 0:\n"
82         "               out[3*i+1] = f0.y; \n"
83         "               // fall through\n"
84         "           case 1:\n"
85         "               out[3*i] = f0.x; \n"
86         "               break;\n"
87         "       }\n"
88         "   }\n"
89         "}\n"
90     };
91 
92     const char **kern = c;
93     size_t kernSize = sizeof(c) / sizeof(c[0]);
94 
95     if (sizeValues[vectorSize] == 3)
96     {
97         kern = c3;
98         kernSize = sizeof(c3) / sizeof(c3[0]);
99     }
100 
101     char testName[32];
102     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
103              sizeNames[vectorSize]);
104 
105     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
106                        relaxedMode);
107 }
108 
109 typedef struct BuildKernelInfo
110 {
111     cl_uint offset; // the first vector size to build
112     cl_uint kernel_count;
113     cl_kernel **kernels;
114     cl_program *programs;
115     const char *nameInCode;
116     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
117 } BuildKernelInfo;
118 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)119 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
120 {
121     BuildKernelInfo *info = (BuildKernelInfo *)p;
122     cl_uint i = info->offset + job_id;
123     return BuildKernel(info->nameInCode, i, info->kernel_count,
124                        info->kernels[i], info->programs + i, info->relaxedMode);
125 }
126 
127 // Thread specific data for a worker thread
128 typedef struct ThreadInfo
129 {
130     cl_mem inBuf; // input buffer for the thread
131     cl_mem inBuf2; // input buffer for the thread
132     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
133     float maxError; // max error value. Init to 0.
134     double
135         maxErrorValue; // position of the max error value (param 1).  Init to 0.
136     cl_int maxErrorValue2; // position of the max error value (param 2).  Init
137                            // to 0.
138     MTdata d;
139     cl_command_queue tQueue; // per thread command queue to improve performance
140 } ThreadInfo;
141 
142 typedef struct TestInfo
143 {
144     size_t subBufferSize; // Size of the sub-buffer in elements
145     const Func *f; // A pointer to the function info
146     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
147     cl_kernel
148         *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
149                                // worker thread:  k[vector_size][thread_id]
150     ThreadInfo *
151         tinfo; // An array of thread specific information for each worker thread
152     cl_uint threadCount; // Number of worker threads
153     cl_uint jobCount; // Number of jobs
154     cl_uint step; // step between each chunk and the next.
155     cl_uint scale; // stride between individual test values
156     float ulps; // max_allowed ulps
157     int ftz; // non-zero if running in flush to zero mode
158 
159     // no special values
160 } TestInfo;
161 
162 // A table of more difficult cases to get right
163 static const float specialValues[] = {
164     -NAN,
165     -INFINITY,
166     -FLT_MAX,
167     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
168     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
169     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
170     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
171     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
172     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
173     MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
174     MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
175     MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
176     MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
177     MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
178     MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
179     -1000.f,
180     -100.f,
181     -4.0f,
182     -3.5f,
183     -3.0f,
184     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
185     -2.5f,
186     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
187     -2.0f,
188     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
189     -1.5f,
190     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
191     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
192     -1.0f,
193     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
194     MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
195     -0.5f,
196     MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
197     MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
198     -0.25f,
199     MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
200     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
201     -FLT_MIN,
202     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
203     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
204     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
205     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
206     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
207     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
208     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
209     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
210     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
211     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
212     -0.0f,
213 
214     +NAN,
215     +INFINITY,
216     +FLT_MAX,
217     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
218     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
219     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
220     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
221     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
222     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
223     MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
224     MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
225     MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
226     MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
227     MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
228     MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
229     +1000.f,
230     +100.f,
231     +4.0f,
232     +3.5f,
233     +3.0f,
234     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
235     2.5f,
236     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
237     +2.0f,
238     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
239     1.5f,
240     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
241     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
242     +1.0f,
243     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
244     MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
245     +0.5f,
246     MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
247     MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
248     +0.25f,
249     MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
250     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
251     +FLT_MIN,
252     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
253     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
254     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
255     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
256     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
257     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
258     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
259     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
260     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
261     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
262     +0.0f,
263 };
264 
265 static const size_t specialValuesCount =
266     sizeof(specialValues) / sizeof(specialValues[0]);
267 
268 static const int specialValuesInt[] = {
269     0,           1,           2,           3,          126,        127,
270     128,         0x02000001,  0x04000001,  1465264071, 1488522147, -1,
271     -2,          -3,          -126,        -127,       -128,       -0x02000001,
272     -0x04000001, -1465264071, -1488522147,
273 };
274 static size_t specialValuesIntCount =
275     sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
276 
277 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
278 
TestFunc_Float_Float_Int(const Func * f,MTdata d,bool relaxedMode)279 int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
280 {
281     TestInfo test_info;
282     cl_int error;
283     float maxError = 0.0f;
284     double maxErrorVal = 0.0;
285     cl_int maxErrorVal2 = 0;
286 
287     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
288 
289     // Init test_info
290     memset(&test_info, 0, sizeof(test_info));
291     test_info.threadCount = GetThreadCount();
292     test_info.subBufferSize = BUFFER_SIZE
293         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
294     test_info.scale = getTestScale(sizeof(cl_float));
295 
296     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
297     if (test_info.step / test_info.subBufferSize != test_info.scale)
298     {
299         // there was overflow
300         test_info.jobCount = 1;
301     }
302     else
303     {
304         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
305     }
306 
307     test_info.f = f;
308     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
309     test_info.ftz =
310         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
311 
312     // cl_kernels aren't thread safe, so we make one for each vector size for
313     // every thread
314     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
315     {
316         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
317         test_info.k[i] = (cl_kernel *)malloc(array_size);
318         if (NULL == test_info.k[i])
319         {
320             vlog_error("Error: Unable to allocate storage for kernels!\n");
321             error = CL_OUT_OF_HOST_MEMORY;
322             goto exit;
323         }
324         memset(test_info.k[i], 0, array_size);
325     }
326     test_info.tinfo =
327         (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
328     if (NULL == test_info.tinfo)
329     {
330         vlog_error(
331             "Error: Unable to allocate storage for thread specific data.\n");
332         error = CL_OUT_OF_HOST_MEMORY;
333         goto exit;
334     }
335     memset(test_info.tinfo, 0,
336            test_info.threadCount * sizeof(*test_info.tinfo));
337     for (cl_uint i = 0; i < test_info.threadCount; i++)
338     {
339         cl_buffer_region region = {
340             i * test_info.subBufferSize * sizeof(cl_float),
341             test_info.subBufferSize * sizeof(cl_float)
342         };
343         test_info.tinfo[i].inBuf =
344             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
345                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
346         if (error || NULL == test_info.tinfo[i].inBuf)
347         {
348             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
349                        "region {%zd, %zd}\n",
350                        region.origin, region.size);
351             goto exit;
352         }
353         cl_buffer_region region2 = { i * test_info.subBufferSize
354                                          * sizeof(cl_int),
355                                      test_info.subBufferSize * sizeof(cl_int) };
356         test_info.tinfo[i].inBuf2 =
357             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
358                               CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
359         if (error || NULL == test_info.tinfo[i].inBuf2)
360         {
361             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
362                        "region {%zd, %zd}\n",
363                        region.origin, region.size);
364             goto exit;
365         }
366 
367         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
368         {
369             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
370                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
371                 &region, &error);
372             if (error || NULL == test_info.tinfo[i].outBuf[j])
373             {
374                 vlog_error("Error: Unable to create sub-buffer of "
375                            "gOutBuffer[%d] for region {%zd, %zd}\n",
376                            (int)j, region.origin, region.size);
377                 goto exit;
378             }
379         }
380         test_info.tinfo[i].tQueue =
381             clCreateCommandQueue(gContext, gDevice, 0, &error);
382         if (NULL == test_info.tinfo[i].tQueue || error)
383         {
384             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
385             goto exit;
386         }
387 
388         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
389     }
390 
391     // Init the kernels
392     {
393         BuildKernelInfo build_info = {
394             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
395             test_info.programs,  f->nameInCode,         relaxedMode
396         };
397         if ((error = ThreadPool_Do(BuildKernelFn,
398                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
399                                    &build_info)))
400             goto exit;
401     }
402 
403     // Run the kernels
404     if (!gSkipCorrectnessTesting)
405     {
406         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
407 
408         // Accumulate the arithmetic errors
409         for (cl_uint i = 0; i < test_info.threadCount; i++)
410         {
411             if (test_info.tinfo[i].maxError > maxError)
412             {
413                 maxError = test_info.tinfo[i].maxError;
414                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
415                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
416             }
417         }
418 
419         if (error) goto exit;
420 
421         if (gWimpyMode)
422             vlog("Wimp pass");
423         else
424             vlog("passed");
425 
426         vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
427     }
428 
429     vlog("\n");
430 
431 exit:
432     // Release
433     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
434     {
435         clReleaseProgram(test_info.programs[i]);
436         if (test_info.k[i])
437         {
438             for (cl_uint j = 0; j < test_info.threadCount; j++)
439                 clReleaseKernel(test_info.k[i][j]);
440 
441             free(test_info.k[i]);
442         }
443     }
444     if (test_info.tinfo)
445     {
446         for (cl_uint i = 0; i < test_info.threadCount; i++)
447         {
448             free_mtdata(test_info.tinfo[i].d);
449             clReleaseMemObject(test_info.tinfo[i].inBuf);
450             clReleaseMemObject(test_info.tinfo[i].inBuf2);
451             for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
452                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
453             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
454         }
455 
456         free(test_info.tinfo);
457     }
458 
459     return error;
460 }
461 
Test(cl_uint job_id,cl_uint thread_id,void * data)462 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
463 {
464     const TestInfo *job = (const TestInfo *)data;
465     size_t buffer_elements = job->subBufferSize;
466     size_t buffer_size = buffer_elements * sizeof(cl_float);
467     cl_uint base = job_id * (cl_uint)job->step;
468     ThreadInfo *tinfo = job->tinfo + thread_id;
469     fptr func = job->f->func;
470     int ftz = job->ftz;
471     float ulps = job->ulps;
472     MTdata d = tinfo->d;
473     cl_int error;
474     const char *name = job->f->name;
475     cl_uint *t = 0;
476     cl_float *r = 0;
477     cl_float *s = 0;
478     cl_int *s2 = 0;
479 
480     // start the map of the output arrays
481     cl_event e[VECTOR_SIZE_COUNT];
482     cl_uint *out[VECTOR_SIZE_COUNT];
483     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
484     {
485         out[j] = (cl_uint *)clEnqueueMapBuffer(
486             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
487             buffer_size, 0, NULL, e + j, &error);
488         if (error || NULL == out[j])
489         {
490             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
491                        error);
492             return error;
493         }
494     }
495 
496     // Get that moving
497     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
498 
499     // Init input array
500     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
501     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
502     size_t idx = 0;
503     int totalSpecialValueCount = specialValuesCount * specialValuesIntCount;
504     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
505 
506     if (job_id <= (cl_uint)lastSpecialJobIndex)
507     { // test edge cases
508         float *fp = (float *)p;
509         cl_int *ip2 = (cl_int *)p2;
510         uint32_t x, y;
511 
512         x = (job_id * buffer_elements) % specialValuesCount;
513         y = (job_id * buffer_elements) / specialValuesCount;
514 
515         for (; idx < buffer_elements; idx++)
516         {
517             fp[idx] = specialValues[x];
518             ip2[idx] = specialValuesInt[y];
519             ++x;
520             if (x >= specialValuesCount)
521             {
522                 x = 0;
523                 y++;
524                 if (y >= specialValuesIntCount) break;
525             }
526         }
527     }
528 
529     // Init any remaining values.
530     for (; idx < buffer_elements; idx++)
531     {
532         p[idx] = genrand_int32(d);
533         p2[idx] = genrand_int32(d);
534     }
535 
536     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
537                                       buffer_size, p, 0, NULL, NULL)))
538     {
539         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
540         goto exit;
541     }
542 
543     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
544                                       buffer_size, p2, 0, NULL, NULL)))
545     {
546         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
547         goto exit;
548     }
549 
550     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
551     {
552         // Wait for the map to finish
553         if ((error = clWaitForEvents(1, e + j)))
554         {
555             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
556             goto exit;
557         }
558         if ((error = clReleaseEvent(e[j])))
559         {
560             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
561             goto exit;
562         }
563 
564         // Fill the result buffer with garbage, so that old results don't carry
565         // over
566         uint32_t pattern = 0xffffdead;
567         memset_pattern4(out[j], &pattern, buffer_size);
568         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
569                                              out[j], 0, NULL, NULL)))
570         {
571             vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
572             goto exit;
573         }
574 
575         // run the kernel
576         size_t vectorCount =
577             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
578         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
579                                                  // own copy of the cl_kernel
580         cl_program program = job->programs[j];
581 
582         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
583                                     &tinfo->outBuf[j])))
584         {
585             LogBuildError(program);
586             return error;
587         }
588         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
589                                     &tinfo->inBuf)))
590         {
591             LogBuildError(program);
592             return error;
593         }
594         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
595                                     &tinfo->inBuf2)))
596         {
597             LogBuildError(program);
598             return error;
599         }
600 
601         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
602                                             &vectorCount, NULL, 0, NULL, NULL)))
603         {
604             vlog_error("FAILED -- could not execute kernel\n");
605             goto exit;
606         }
607     }
608 
609     // Get that moving
610     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
611 
612     if (gSkipCorrectnessTesting) return CL_SUCCESS;
613 
614     // Calculate the correctly rounded reference result
615     r = (float *)gOut_Ref + thread_id * buffer_elements;
616     s = (float *)gIn + thread_id * buffer_elements;
617     s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
618     for (size_t j = 0; j < buffer_elements; j++)
619         r[j] = (float)func.f_fi(s[j], s2[j]);
620 
621     // Read the data back -- no need to wait for the first N-1 buffers but wait
622     // for the last buffer. This is an in order queue.
623     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
624     {
625         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
626         out[j] = (cl_uint *)clEnqueueMapBuffer(
627             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
628             buffer_size, 0, NULL, NULL, &error);
629         if (error || NULL == out[j])
630         {
631             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
632                        error);
633             goto exit;
634         }
635     }
636 
637     // Verify data
638     t = (cl_uint *)r;
639     for (size_t j = 0; j < buffer_elements; j++)
640     {
641         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
642         {
643             cl_uint *q = out[k];
644 
645             // If we aren't getting the correctly rounded result
646             if (t[j] != q[j])
647             {
648                 float test = ((float *)q)[j];
649                 double correct = func.f_fi(s[j], s2[j]);
650                 float err = Ulp_Error(test, correct);
651                 int fail = !(fabsf(err) <= ulps);
652 
653                 if (fail && ftz)
654                 {
655                     // retry per section 6.5.3.2
656                     if (IsFloatResultSubnormal(correct, ulps))
657                     {
658                         fail = fail && (test != 0.0f);
659                         if (!fail) err = 0.0f;
660                     }
661 
662                     // retry per section 6.5.3.3
663                     if (IsFloatSubnormal(s[j]))
664                     {
665                         double correct2, correct3;
666                         float err2, err3;
667                         correct2 = func.f_fi(0.0, s2[j]);
668                         correct3 = func.f_fi(-0.0, s2[j]);
669                         err2 = Ulp_Error(test, correct2);
670                         err3 = Ulp_Error(test, correct3);
671                         fail = fail
672                             && ((!(fabsf(err2) <= ulps))
673                                 && (!(fabsf(err3) <= ulps)));
674                         if (fabsf(err2) < fabsf(err)) err = err2;
675                         if (fabsf(err3) < fabsf(err)) err = err3;
676 
677                         // retry per section 6.5.3.4
678                         if (IsFloatResultSubnormal(correct2, ulps)
679                             || IsFloatResultSubnormal(correct3, ulps))
680                         {
681                             fail = fail && (test != 0.0f);
682                             if (!fail) err = 0.0f;
683                         }
684                     }
685                 }
686 
687                 if (fabsf(err) > tinfo->maxError)
688                 {
689                     tinfo->maxError = fabsf(err);
690                     tinfo->maxErrorValue = s[j];
691                     tinfo->maxErrorValue2 = s2[j];
692                 }
693                 if (fail)
694                 {
695                     vlog_error(
696                         "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
697                         "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %d\n",
698                         name, sizeNames[k], err, s[j], ((uint32_t *)s)[j],
699                         s2[j], r[j], ((uint32_t *)r)[j], test,
700                         ((cl_uint *)&test)[0], j);
701                     error = -1;
702                     goto exit;
703                 }
704             }
705         }
706     }
707 
708     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
709     {
710         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
711                                              out[j], 0, NULL, NULL)))
712         {
713             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
714                        j, error);
715             return error;
716         }
717     }
718 
719     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
720 
721 
722     if (0 == (base & 0x0fffffff))
723     {
724         if (gVerboseBruteForce)
725         {
726             vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
727                  "ThreadCount:%2u\n",
728                  base, job->step, job->scale, buffer_elements, job->ulps,
729                  job->threadCount);
730         }
731         else
732         {
733             vlog(".");
734         }
735         fflush(stdout);
736     }
737 
738 exit:
739     return error;
740 }
741