1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <cstring>
22 
BuildKernel(const char * operator_symbol,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)23 static int BuildKernel(const char *operator_symbol, int vectorSize,
24                        cl_uint kernel_count, cl_kernel *k, cl_program *p,
25                        bool relaxedMode)
26 {
27     const char *c[] = { "__kernel void math_kernel",
28                         sizeNames[vectorSize],
29                         "( __global float",
30                         sizeNames[vectorSize],
31                         "* out, __global float",
32                         sizeNames[vectorSize],
33                         "* in1, __global float",
34                         sizeNames[vectorSize],
35                         "* in2 )\n"
36                         "{\n"
37                         "   size_t i = get_global_id(0);\n"
38                         "   out[i] = in1[i] ",
39                         operator_symbol,
40                         " in2[i];\n"
41                         "}\n" };
42 
43     const char *c3[] = {
44         "__kernel void math_kernel",
45         sizeNames[vectorSize],
46         "( __global float* out, __global float* in, __global float* in2)\n"
47         "{\n"
48         "   size_t i = get_global_id(0);\n"
49         "   if( i + 1 < get_global_size(0) )\n"
50         "   {\n"
51         "       float3 f0 = vload3( 0, in + 3 * i );\n"
52         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
53         "       f0 = f0 ",
54         operator_symbol,
55         " f1;\n"
56         "       vstore3( f0, 0, out + 3*i );\n"
57         "   }\n"
58         "   else\n"
59         "   {\n"
60         "       size_t parity = i & 1;   // Figure out how many elements are "
61         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
62         "buffer size \n"
63         "       float3 f0;\n"
64         "       float3 f1;\n"
65         "       switch( parity )\n"
66         "       {\n"
67         "           case 1:\n"
68         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
69         "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
70         "               break;\n"
71         "           case 0:\n"
72         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
73         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
74         "               break;\n"
75         "       }\n"
76         "       f0 = f0 ",
77         operator_symbol,
78         " f1;\n"
79         "       switch( parity )\n"
80         "       {\n"
81         "           case 0:\n"
82         "               out[3*i+1] = f0.y; \n"
83         "               // fall through\n"
84         "           case 1:\n"
85         "               out[3*i] = f0.x; \n"
86         "               break;\n"
87         "       }\n"
88         "   }\n"
89         "}\n"
90     };
91 
92     const char **kern = c;
93     size_t kernSize = sizeof(c) / sizeof(c[0]);
94 
95     if (sizeValues[vectorSize] == 3)
96     {
97         kern = c3;
98         kernSize = sizeof(c3) / sizeof(c3[0]);
99     }
100 
101     char testName[32];
102     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
103              sizeNames[vectorSize]);
104 
105     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
106                        relaxedMode);
107 }
108 
109 typedef struct BuildKernelInfo
110 {
111     cl_uint offset; // the first vector size to build
112     cl_uint kernel_count;
113     cl_kernel **kernels;
114     cl_program *programs;
115     const char *operator_symbol;
116     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
117 } BuildKernelInfo;
118 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)119 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
120 {
121     BuildKernelInfo *info = (BuildKernelInfo *)p;
122     cl_uint i = info->offset + job_id;
123     return BuildKernel(info->operator_symbol, i, info->kernel_count,
124                        info->kernels[i], info->programs + i, info->relaxedMode);
125 }
126 
127 // Thread specific data for a worker thread
128 typedef struct ThreadInfo
129 {
130     cl_mem inBuf; // input buffer for the thread
131     cl_mem inBuf2; // input buffer for the thread
132     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
133     float maxError; // max error value. Init to 0.
134     double
135         maxErrorValue; // position of the max error value (param 1).  Init to 0.
136     double maxErrorValue2; // position of the max error value (param 2).  Init
137                            // to 0.
138     MTdata d;
139     cl_command_queue tQueue; // per thread command queue to improve performance
140 } ThreadInfo;
141 
142 typedef struct TestInfo
143 {
144     size_t subBufferSize; // Size of the sub-buffer in elements
145     const Func *f; // A pointer to the function info
146     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
147     cl_kernel
148         *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
149                                // worker thread:  k[vector_size][thread_id]
150     ThreadInfo *
151         tinfo; // An array of thread specific information for each worker thread
152     cl_uint threadCount; // Number of worker threads
153     cl_uint jobCount; // Number of jobs
154     cl_uint step; // step between each chunk and the next.
155     cl_uint scale; // stride between individual test values
156     float ulps; // max_allowed ulps
157     int ftz; // non-zero if running in flush to zero mode
158     bool relaxedMode; // True if the test is being run in relaxed mode, false
159                       // otherwise.
160 
161     // no special fields
162 } TestInfo;
163 
164 // A table of more difficult cases to get right
165 static const float specialValues[] = {
166     -NAN,
167     -INFINITY,
168     -FLT_MAX,
169     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
170     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
171     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
172     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
173     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
174     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
175     MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
176     MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
177     MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
178     MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
179     MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
180     MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
181     -1000.f,
182     -100.f,
183     -4.0f,
184     -3.5f,
185     -3.0f,
186     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
187     -2.5f,
188     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
189     -2.0f,
190     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
191     -1.5f,
192     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
193     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
194     -1.0f,
195     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
196     MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
197     -0.5f,
198     MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
199     MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
200     -0.25f,
201     MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
202     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
203     -FLT_MIN,
204     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
205     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
206     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
207     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
208     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
209     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
210     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
211     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
212     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
213     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
214     -0.0f,
215 
216     +NAN,
217     +INFINITY,
218     +FLT_MAX,
219     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
220     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
221     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
222     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
223     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
224     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
225     MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
226     MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
227     MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
228     MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
229     MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
230     MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
231     +1000.f,
232     +100.f,
233     +4.0f,
234     +3.5f,
235     +3.0f,
236     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
237     2.5f,
238     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
239     +2.0f,
240     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
241     1.5f,
242     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
243     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
244     +1.0f,
245     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
246     MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
247     +0.5f,
248     MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
249     MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
250     +0.25f,
251     MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
252     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
253     +FLT_MIN,
254     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
255     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
256     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
257     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
258     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
259     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
260     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
261     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
262     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
263     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
264     +0.0f,
265 };
266 
267 static const size_t specialValuesCount =
268     sizeof(specialValues) / sizeof(specialValues[0]);
269 
270 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
271 
TestFunc_Float_Float_Float_Operator(const Func * f,MTdata d,bool relaxedMode)272 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
273                                         bool relaxedMode)
274 {
275     TestInfo test_info;
276     cl_int error;
277     float maxError = 0.0f;
278     double maxErrorVal = 0.0;
279     double maxErrorVal2 = 0.0;
280 
281     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
282 
283     // Init test_info
284     memset(&test_info, 0, sizeof(test_info));
285     test_info.threadCount = GetThreadCount();
286     test_info.subBufferSize = BUFFER_SIZE
287         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
288     test_info.scale = getTestScale(sizeof(cl_float));
289 
290     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
291     if (test_info.step / test_info.subBufferSize != test_info.scale)
292     {
293         // there was overflow
294         test_info.jobCount = 1;
295     }
296     else
297     {
298         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
299     }
300 
301     test_info.f = f;
302     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
303     test_info.ftz =
304         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
305     test_info.relaxedMode = relaxedMode;
306 
307     // cl_kernels aren't thread safe, so we make one for each vector size for
308     // every thread
309     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
310     {
311         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
312         test_info.k[i] = (cl_kernel *)malloc(array_size);
313         if (NULL == test_info.k[i])
314         {
315             vlog_error("Error: Unable to allocate storage for kernels!\n");
316             error = CL_OUT_OF_HOST_MEMORY;
317             goto exit;
318         }
319         memset(test_info.k[i], 0, array_size);
320     }
321     test_info.tinfo =
322         (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
323     if (NULL == test_info.tinfo)
324     {
325         vlog_error(
326             "Error: Unable to allocate storage for thread specific data.\n");
327         error = CL_OUT_OF_HOST_MEMORY;
328         goto exit;
329     }
330     memset(test_info.tinfo, 0,
331            test_info.threadCount * sizeof(*test_info.tinfo));
332     for (cl_uint i = 0; i < test_info.threadCount; i++)
333     {
334         cl_buffer_region region = {
335             i * test_info.subBufferSize * sizeof(cl_float),
336             test_info.subBufferSize * sizeof(cl_float)
337         };
338         test_info.tinfo[i].inBuf =
339             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
340                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
341         if (error || NULL == test_info.tinfo[i].inBuf)
342         {
343             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
344                        "region {%zd, %zd}\n",
345                        region.origin, region.size);
346             goto exit;
347         }
348         test_info.tinfo[i].inBuf2 =
349             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
350                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
351         if (error || NULL == test_info.tinfo[i].inBuf2)
352         {
353             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
354                        "region {%zd, %zd}\n",
355                        region.origin, region.size);
356             goto exit;
357         }
358 
359         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
360         {
361             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
362                 gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
363                 &region, &error);
364             if (error || NULL == test_info.tinfo[i].outBuf[j])
365             {
366                 vlog_error("Error: Unable to create sub-buffer of "
367                            "gOutBuffer[%d] for region {%zd, %zd}\n",
368                            (int)j, region.origin, region.size);
369                 goto exit;
370             }
371         }
372         test_info.tinfo[i].tQueue =
373             clCreateCommandQueue(gContext, gDevice, 0, &error);
374         if (NULL == test_info.tinfo[i].tQueue || error)
375         {
376             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
377             goto exit;
378         }
379 
380         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
381     }
382 
383     // Init the kernels
384     {
385         BuildKernelInfo build_info = {
386             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
387             test_info.programs,  f->nameInCode,         relaxedMode
388         };
389         if ((error = ThreadPool_Do(BuildKernelFn,
390                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
391                                    &build_info)))
392             goto exit;
393     }
394 
395     // Run the kernels
396     if (!gSkipCorrectnessTesting)
397     {
398         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
399 
400         // Accumulate the arithmetic errors
401         for (cl_uint i = 0; i < test_info.threadCount; i++)
402         {
403             if (test_info.tinfo[i].maxError > maxError)
404             {
405                 maxError = test_info.tinfo[i].maxError;
406                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
407                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
408             }
409         }
410 
411         if (error) goto exit;
412 
413         if (gWimpyMode)
414             vlog("Wimp pass");
415         else
416             vlog("passed");
417 
418         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
419     }
420 
421     vlog("\n");
422 
423 exit:
424     // Release
425     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
426     {
427         clReleaseProgram(test_info.programs[i]);
428         if (test_info.k[i])
429         {
430             for (cl_uint j = 0; j < test_info.threadCount; j++)
431                 clReleaseKernel(test_info.k[i][j]);
432 
433             free(test_info.k[i]);
434         }
435     }
436     if (test_info.tinfo)
437     {
438         for (cl_uint i = 0; i < test_info.threadCount; i++)
439         {
440             free_mtdata(test_info.tinfo[i].d);
441             clReleaseMemObject(test_info.tinfo[i].inBuf);
442             clReleaseMemObject(test_info.tinfo[i].inBuf2);
443             for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
444                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
445             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
446         }
447 
448         free(test_info.tinfo);
449     }
450 
451     return error;
452 }
453 
Test(cl_uint job_id,cl_uint thread_id,void * data)454 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
455 {
456     const TestInfo *job = (const TestInfo *)data;
457     size_t buffer_elements = job->subBufferSize;
458     size_t buffer_size = buffer_elements * sizeof(cl_float);
459     cl_uint base = job_id * (cl_uint)job->step;
460     ThreadInfo *tinfo = job->tinfo + thread_id;
461     fptr func = job->f->func;
462     int ftz = job->ftz;
463     bool relaxedMode = job->relaxedMode;
464     float ulps = getAllowedUlpError(job->f, relaxedMode);
465     MTdata d = tinfo->d;
466     cl_int error;
467     cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
468     const char *name = job->f->name;
469     cl_uint *t = 0;
470     cl_float *r = 0;
471     cl_float *s = 0;
472     cl_float *s2 = 0;
473     RoundingMode oldRoundMode;
474 
475     if (relaxedMode)
476     {
477         func = job->f->rfunc;
478     }
479 
480     // start the map of the output arrays
481     cl_event e[VECTOR_SIZE_COUNT];
482     cl_uint *out[VECTOR_SIZE_COUNT];
483     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
484     {
485         out[j] = (cl_uint *)clEnqueueMapBuffer(
486             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
487             buffer_size, 0, NULL, e + j, &error);
488         if (error || NULL == out[j])
489         {
490             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
491                        error);
492             return error;
493         }
494     }
495 
496     // Get that moving
497     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
498 
499     // Init input array
500     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
501     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
502     cl_uint idx = 0;
503     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
504     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
505 
506     if (job_id <= (cl_uint)lastSpecialJobIndex)
507     {
508         // Insert special values
509         uint32_t x, y;
510 
511         x = (job_id * buffer_elements) % specialValuesCount;
512         y = (job_id * buffer_elements) / specialValuesCount;
513 
514         for (; idx < buffer_elements; idx++)
515         {
516             p[idx] = ((cl_uint *)specialValues)[x];
517             p2[idx] = ((cl_uint *)specialValues)[y];
518             ++x;
519             if (x >= specialValuesCount)
520             {
521                 x = 0;
522                 y++;
523                 if (y >= specialValuesCount) break;
524             }
525             if (relaxedMode && strcmp(name, "divide") == 0)
526             {
527                 cl_uint pj = p[idx] & 0x7fffffff;
528                 cl_uint p2j = p2[idx] & 0x7fffffff;
529                 // Replace values outside [2^-62, 2^62] with QNaN
530                 if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
531                 if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
532             }
533         }
534     }
535 
536     // Init any remaining values.
537     for (; idx < buffer_elements; idx++)
538     {
539         p[idx] = genrand_int32(d);
540         p2[idx] = genrand_int32(d);
541 
542         if (relaxedMode && strcmp(name, "divide") == 0)
543         {
544             cl_uint pj = p[idx] & 0x7fffffff;
545             cl_uint p2j = p2[idx] & 0x7fffffff;
546             // Replace values outside [2^-62, 2^62] with QNaN
547             if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
548             if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
549         }
550     }
551 
552     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
553                                       buffer_size, p, 0, NULL, NULL)))
554     {
555         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
556         goto exit;
557     }
558 
559     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
560                                       buffer_size, p2, 0, NULL, NULL)))
561     {
562         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
563         goto exit;
564     }
565 
566     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
567     {
568         // Wait for the map to finish
569         if ((error = clWaitForEvents(1, e + j)))
570         {
571             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
572             goto exit;
573         }
574         if ((error = clReleaseEvent(e[j])))
575         {
576             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
577             goto exit;
578         }
579 
580         // Fill the result buffer with garbage, so that old results don't carry
581         // over
582         uint32_t pattern = 0xffffdead;
583         memset_pattern4(out[j], &pattern, buffer_size);
584         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
585                                              out[j], 0, NULL, NULL)))
586         {
587             vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
588             goto exit;
589         }
590 
591         // run the kernel
592         size_t vectorCount =
593             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
594         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
595                                                  // own copy of the cl_kernel
596         cl_program program = job->programs[j];
597 
598         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
599                                     &tinfo->outBuf[j])))
600         {
601             LogBuildError(program);
602             return error;
603         }
604         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
605                                     &tinfo->inBuf)))
606         {
607             LogBuildError(program);
608             return error;
609         }
610         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
611                                     &tinfo->inBuf2)))
612         {
613             LogBuildError(program);
614             return error;
615         }
616 
617         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
618                                             &vectorCount, NULL, 0, NULL, NULL)))
619         {
620             vlog_error("FAILED -- could not execute kernel\n");
621             goto exit;
622         }
623     }
624 
625     // Get that moving
626     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
627 
628     if (gSkipCorrectnessTesting)
629     {
630         free(overflow);
631         return CL_SUCCESS;
632     }
633 
634     // Calculate the correctly rounded reference result
635     FPU_mode_type oldMode;
636     memset(&oldMode, 0, sizeof(oldMode));
637     if (ftz) ForceFTZ(&oldMode);
638 
639     // Set the rounding mode to match the device
640     oldRoundMode = kRoundToNearestEven;
641     if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
642 
643     // Calculate the correctly rounded reference result
644     r = (float *)gOut_Ref + thread_id * buffer_elements;
645     s = (float *)gIn + thread_id * buffer_elements;
646     s2 = (float *)gIn2 + thread_id * buffer_elements;
647     if (gInfNanSupport)
648     {
649         for (size_t j = 0; j < buffer_elements; j++)
650             r[j] = (float)func.f_ff(s[j], s2[j]);
651     }
652     else
653     {
654         for (size_t j = 0; j < buffer_elements; j++)
655         {
656             feclearexcept(FE_OVERFLOW);
657             r[j] = (float)func.f_ff(s[j], s2[j]);
658             overflow[j] =
659                 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
660         }
661     }
662 
663     if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
664 
665     if (ftz) RestoreFPState(&oldMode);
666 
667     // Read the data back -- no need to wait for the first N-1 buffers but wait
668     // for the last buffer. This is an in order queue.
669     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
670     {
671         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
672         out[j] = (cl_uint *)clEnqueueMapBuffer(
673             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
674             buffer_size, 0, NULL, NULL, &error);
675         if (error || NULL == out[j])
676         {
677             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
678                        error);
679             goto exit;
680         }
681     }
682 
683     // Verify data
684     t = (cl_uint *)r;
685     for (size_t j = 0; j < buffer_elements; j++)
686     {
687         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
688         {
689             cl_uint *q = out[k];
690 
691             // If we aren't getting the correctly rounded result
692             if (t[j] != q[j])
693             {
694                 float test = ((float *)q)[j];
695                 double correct = func.f_ff(s[j], s2[j]);
696 
697                 // Per section 10 paragraph 6, accept any result if an input or
698                 // output is a infinity or NaN or overflow
699                 if (!gInfNanSupport)
700                 {
701                     // Note: no double rounding here.  Reference functions
702                     // calculate in single precision.
703                     if (overflow[j] || IsFloatInfinity(correct)
704                         || IsFloatNaN(correct) || IsFloatInfinity(s2[j])
705                         || IsFloatNaN(s2[j]) || IsFloatInfinity(s[j])
706                         || IsFloatNaN(s[j]))
707                         continue;
708                 }
709 
710                 // Per section 10 paragraph 6, accept embedded devices always
711                 // returning positive 0.0.
712                 if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000))
713                     continue;
714 
715                 float err = Ulp_Error(test, correct);
716                 float errB = Ulp_Error(test, (float)correct);
717 
718                 int fail =
719                     ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
720                 if (fabsf(errB) < fabsf(err)) err = errB;
721 
722                 if (fail && ftz)
723                 {
724                     // retry per section 6.5.3.2
725                     if (IsFloatResultSubnormal(correct, ulps))
726                     {
727                         fail = fail && (test != 0.0f);
728                         if (!fail) err = 0.0f;
729                     }
730 
731                     // retry per section 6.5.3.3
732                     if (IsFloatSubnormal(s[j]))
733                     {
734                         double correct2, correct3;
735                         float err2, err3;
736 
737                         if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
738 
739                         correct2 = func.f_ff(0.0, s2[j]);
740                         correct3 = func.f_ff(-0.0, s2[j]);
741 
742                         // Per section 10 paragraph 6, accept any result if an
743                         // input or output is a infinity or NaN or overflow
744                         if (!gInfNanSupport)
745                         {
746                             if (fetestexcept(FE_OVERFLOW)) continue;
747 
748                             // Note: no double rounding here.  Reference
749                             // functions calculate in single precision.
750                             if (IsFloatInfinity(correct2)
751                                 || IsFloatNaN(correct2)
752                                 || IsFloatInfinity(correct3)
753                                 || IsFloatNaN(correct3))
754                                 continue;
755                         }
756 
757                         err2 = Ulp_Error(test, correct2);
758                         err3 = Ulp_Error(test, correct3);
759                         fail = fail
760                             && ((!(fabsf(err2) <= ulps))
761                                 && (!(fabsf(err3) <= ulps)));
762                         if (fabsf(err2) < fabsf(err)) err = err2;
763                         if (fabsf(err3) < fabsf(err)) err = err3;
764 
765                         // retry per section 6.5.3.4
766                         if (IsFloatResultSubnormal(correct2, ulps)
767                             || IsFloatResultSubnormal(correct3, ulps))
768                         {
769                             fail = fail && (test != 0.0f);
770                             if (!fail) err = 0.0f;
771                         }
772 
773                         // try with both args as zero
774                         if (IsFloatSubnormal(s2[j]))
775                         {
776                             double correct4, correct5;
777                             float err4, err5;
778 
779                             if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
780 
781                             correct2 = func.f_ff(0.0, 0.0);
782                             correct3 = func.f_ff(-0.0, 0.0);
783                             correct4 = func.f_ff(0.0, -0.0);
784                             correct5 = func.f_ff(-0.0, -0.0);
785 
786                             // Per section 10 paragraph 6, accept any result if
787                             // an input or output is a infinity or NaN or
788                             // overflow
789                             if (!gInfNanSupport)
790                             {
791                                 if (fetestexcept(FE_OVERFLOW)) continue;
792 
793                                 // Note: no double rounding here.  Reference
794                                 // functions calculate in single precision.
795                                 if (IsFloatInfinity(correct2)
796                                     || IsFloatNaN(correct2)
797                                     || IsFloatInfinity(correct3)
798                                     || IsFloatNaN(correct3)
799                                     || IsFloatInfinity(correct4)
800                                     || IsFloatNaN(correct4)
801                                     || IsFloatInfinity(correct5)
802                                     || IsFloatNaN(correct5))
803                                     continue;
804                             }
805 
806                             err2 = Ulp_Error(test, correct2);
807                             err3 = Ulp_Error(test, correct3);
808                             err4 = Ulp_Error(test, correct4);
809                             err5 = Ulp_Error(test, correct5);
810                             fail = fail
811                                 && ((!(fabsf(err2) <= ulps))
812                                     && (!(fabsf(err3) <= ulps))
813                                     && (!(fabsf(err4) <= ulps))
814                                     && (!(fabsf(err5) <= ulps)));
815                             if (fabsf(err2) < fabsf(err)) err = err2;
816                             if (fabsf(err3) < fabsf(err)) err = err3;
817                             if (fabsf(err4) < fabsf(err)) err = err4;
818                             if (fabsf(err5) < fabsf(err)) err = err5;
819 
820                             // retry per section 6.5.3.4
821                             if (IsFloatResultSubnormal(correct2, ulps)
822                                 || IsFloatResultSubnormal(correct3, ulps)
823                                 || IsFloatResultSubnormal(correct4, ulps)
824                                 || IsFloatResultSubnormal(correct5, ulps))
825                             {
826                                 fail = fail && (test != 0.0f);
827                                 if (!fail) err = 0.0f;
828                             }
829                         }
830                     }
831                     else if (IsFloatSubnormal(s2[j]))
832                     {
833                         double correct2, correct3;
834                         float err2, err3;
835 
836                         if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
837 
838                         correct2 = func.f_ff(s[j], 0.0);
839                         correct3 = func.f_ff(s[j], -0.0);
840 
841                         // Per section 10 paragraph 6, accept any result if an
842                         // input or output is a infinity or NaN or overflow
843                         if (!gInfNanSupport)
844                         {
845                             // Note: no double rounding here.  Reference
846                             // functions calculate in single precision.
847                             if (overflow[j] || IsFloatInfinity(correct)
848                                 || IsFloatNaN(correct)
849                                 || IsFloatInfinity(correct2)
850                                 || IsFloatNaN(correct2))
851                                 continue;
852                         }
853 
854                         err2 = Ulp_Error(test, correct2);
855                         err3 = Ulp_Error(test, correct3);
856                         fail = fail
857                             && ((!(fabsf(err2) <= ulps))
858                                 && (!(fabsf(err3) <= ulps)));
859                         if (fabsf(err2) < fabsf(err)) err = err2;
860                         if (fabsf(err3) < fabsf(err)) err = err3;
861 
862                         // retry per section 6.5.3.4
863                         if (IsFloatResultSubnormal(correct2, ulps)
864                             || IsFloatResultSubnormal(correct3, ulps))
865                         {
866                             fail = fail && (test != 0.0f);
867                             if (!fail) err = 0.0f;
868                         }
869                     }
870                 }
871 
872 
873                 if (fabsf(err) > tinfo->maxError)
874                 {
875                     tinfo->maxError = fabsf(err);
876                     tinfo->maxErrorValue = s[j];
877                     tinfo->maxErrorValue2 = s2[j];
878                 }
879                 if (fail)
880                 {
881                     vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a "
882                                "vs. %a (0x%8.8x) at index: %d\n",
883                                name, sizeNames[k], err, s[j], s2[j], r[j], test,
884                                ((cl_uint *)&test)[0], j);
885                     error = -1;
886                     goto exit;
887                 }
888             }
889         }
890     }
891 
892     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
893     {
894         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
895                                              out[j], 0, NULL, NULL)))
896         {
897             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
898                        j, error);
899             return error;
900         }
901     }
902 
903     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
904 
905 
906     if (0 == (base & 0x0fffffff))
907     {
908         if (gVerboseBruteForce)
909         {
910             vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
911                  "ThreadCount:%2u\n",
912                  base, job->step, job->scale, buffer_elements, job->ulps,
913                  job->threadCount);
914         }
915         else
916         {
917             vlog(".");
918         }
919         fflush(stdout);
920     }
921 
922 exit:
923     if (overflow) free(overflow);
924     return error;
925 }
926