1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <cstring>
22 
23 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
24 
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)25 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
26                        cl_kernel *k, cl_program *p, bool relaxedMode)
27 {
28     const char *c[] = { "__kernel void math_kernel",
29                         sizeNames[vectorSize],
30                         "( __global float",
31                         sizeNames[vectorSize],
32                         "* out, __global float",
33                         sizeNames[vectorSize],
34                         "* in1, __global float",
35                         sizeNames[vectorSize],
36                         "* in2 )\n"
37                         "{\n"
38                         "   size_t i = get_global_id(0);\n"
39                         "   out[i] = ",
40                         name,
41                         "( in1[i], in2[i] );\n"
42                         "}\n" };
43 
44     const char *c3[] = {
45         "__kernel void math_kernel",
46         sizeNames[vectorSize],
47         "( __global float* out, __global float* in, __global float* in2)\n"
48         "{\n"
49         "   size_t i = get_global_id(0);\n"
50         "   if( i + 1 < get_global_size(0) )\n"
51         "   {\n"
52         "       float3 f0 = vload3( 0, in + 3 * i );\n"
53         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
54         "       f0 = ",
55         name,
56         "( f0, f1 );\n"
57         "       vstore3( f0, 0, out + 3*i );\n"
58         "   }\n"
59         "   else\n"
60         "   {\n"
61         "       size_t parity = i & 1;   // Figure out how many elements are "
62         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
63         "buffer size \n"
64         "       float3 f0;\n"
65         "       float3 f1;\n"
66         "       switch( parity )\n"
67         "       {\n"
68         "           case 1:\n"
69         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
70         "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
71         "               break;\n"
72         "           case 0:\n"
73         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
74         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
75         "               break;\n"
76         "       }\n"
77         "       f0 = ",
78         name,
79         "( f0, f1 );\n"
80         "       switch( parity )\n"
81         "       {\n"
82         "           case 0:\n"
83         "               out[3*i+1] = f0.y; \n"
84         "               // fall through\n"
85         "           case 1:\n"
86         "               out[3*i] = f0.x; \n"
87         "               break;\n"
88         "       }\n"
89         "   }\n"
90         "}\n"
91     };
92 
93     const char **kern = c;
94     size_t kernSize = sizeof(c) / sizeof(c[0]);
95 
96     if (sizeValues[vectorSize] == 3)
97     {
98         kern = c3;
99         kernSize = sizeof(c3) / sizeof(c3[0]);
100     }
101 
102     char testName[32];
103     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
104              sizeNames[vectorSize]);
105 
106     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
107                        relaxedMode);
108 }
109 
110 typedef struct BuildKernelInfo
111 {
112     cl_uint offset; // the first vector size to build
113     cl_uint kernel_count;
114     cl_kernel **kernels;
115     cl_program *programs;
116     const char *nameInCode;
117     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
118 } BuildKernelInfo;
119 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)120 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
121 {
122     BuildKernelInfo *info = (BuildKernelInfo *)p;
123     cl_uint i = info->offset + job_id;
124     return BuildKernel(info->nameInCode, i, info->kernel_count,
125                        info->kernels[i], info->programs + i, info->relaxedMode);
126 }
127 
128 // Thread specific data for a worker thread
129 typedef struct ThreadInfo
130 {
131     cl_mem inBuf; // input buffer for the thread
132     cl_mem inBuf2; // input buffer for the thread
133     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
134     float maxError; // max error value. Init to 0.
135     double
136         maxErrorValue; // position of the max error value (param 1).  Init to 0.
137     double maxErrorValue2; // position of the max error value (param 2).  Init
138                            // to 0.
139     MTdata d;
140     cl_command_queue tQueue; // per thread command queue to improve performance
141 } ThreadInfo;
142 
143 typedef struct TestInfo
144 {
145     size_t subBufferSize; // Size of the sub-buffer in elements
146     const Func *f; // A pointer to the function info
147     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
148     cl_kernel
149         *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
150                                // worker thread:  k[vector_size][thread_id]
151     ThreadInfo *
152         tinfo; // An array of thread specific information for each worker thread
153     cl_uint threadCount; // Number of worker threads
154     cl_uint jobCount; // Number of jobs
155     cl_uint step; // step between each chunk and the next.
156     cl_uint scale; // stride between individual test values
157     float ulps; // max_allowed ulps
158     int ftz; // non-zero if running in flush to zero mode
159 
160     int isFDim;
161     int skipNanInf;
162     int isNextafter;
163     bool relaxedMode; // True if test is running in relaxed mode, false
164                       // otherwise.
165 } TestInfo;
166 
167 // A table of more difficult cases to get right
168 static const float specialValues[] = {
169     -NAN,
170     -INFINITY,
171     -FLT_MAX,
172     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
173     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
174     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
175     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
176     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
177     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
178     MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
179     MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
180     MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
181     MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
182     MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
183     MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
184     -1000.f,
185     -100.f,
186     -4.0f,
187     -3.5f,
188     -3.0f,
189     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
190     -2.5f,
191     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
192     -2.0f,
193     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
194     -1.5f,
195     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
196     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
197     -1.0f,
198     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
199     MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
200     -0.5f,
201     MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
202     MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
203     -0.25f,
204     MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
205     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
206     -FLT_MIN,
207     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
208     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
209     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
210     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
211     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
212     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
213     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
214     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
215     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
216     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
217     -0.0f,
218 
219     +NAN,
220     +INFINITY,
221     +FLT_MAX,
222     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
223     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
224     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
225     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
226     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
227     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
228     MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
229     MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
230     MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
231     MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
232     MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
233     MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
234     +1000.f,
235     +100.f,
236     +4.0f,
237     +3.5f,
238     +3.0f,
239     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
240     2.5f,
241     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
242     +2.0f,
243     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
244     1.5f,
245     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
246     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
247     +1.0f,
248     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
249     MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
250     +0.5f,
251     MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
252     MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
253     +0.25f,
254     MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
255     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
256     +FLT_MIN,
257     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
258     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
259     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
260     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
261     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
262     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
263     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
264     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
265     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
266     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
267     +0.0f,
268 };
269 
270 static const size_t specialValuesCount =
271     sizeof(specialValues) / sizeof(specialValues[0]);
272 
273 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
274 
TestFunc_Float_Float_Float(const Func * f,MTdata d,bool relaxedMode)275 int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
276 {
277     TestInfo test_info;
278     cl_int error;
279     float maxError = 0.0f;
280     double maxErrorVal = 0.0;
281     double maxErrorVal2 = 0.0;
282 
283     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
284 
285     // Init test_info
286     memset(&test_info, 0, sizeof(test_info));
287     test_info.threadCount = GetThreadCount();
288     test_info.subBufferSize = BUFFER_SIZE
289         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
290     test_info.scale = getTestScale(sizeof(cl_float));
291 
292     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
293     if (test_info.step / test_info.subBufferSize != test_info.scale)
294     {
295         // there was overflow
296         test_info.jobCount = 1;
297     }
298     else
299     {
300         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
301     }
302 
303     test_info.f = f;
304     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
305     test_info.ftz =
306         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
307     test_info.relaxedMode = relaxedMode;
308     test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
309     test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
310     test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
311 
312     // cl_kernels aren't thread safe, so we make one for each vector size for
313     // every thread
314     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
315     {
316         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
317         test_info.k[i] = (cl_kernel *)malloc(array_size);
318         if (NULL == test_info.k[i])
319         {
320             vlog_error("Error: Unable to allocate storage for kernels!\n");
321             error = CL_OUT_OF_HOST_MEMORY;
322             goto exit;
323         }
324         memset(test_info.k[i], 0, array_size);
325     }
326     test_info.tinfo =
327         (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
328     if (NULL == test_info.tinfo)
329     {
330         vlog_error(
331             "Error: Unable to allocate storage for thread specific data.\n");
332         error = CL_OUT_OF_HOST_MEMORY;
333         goto exit;
334     }
335     memset(test_info.tinfo, 0,
336            test_info.threadCount * sizeof(*test_info.tinfo));
337     for (cl_uint i = 0; i < test_info.threadCount; i++)
338     {
339         cl_buffer_region region = {
340             i * test_info.subBufferSize * sizeof(cl_float),
341             test_info.subBufferSize * sizeof(cl_float)
342         };
343         test_info.tinfo[i].inBuf =
344             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
345                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
346         if (error || NULL == test_info.tinfo[i].inBuf)
347         {
348             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
349                        "region {%zd, %zd}\n",
350                        region.origin, region.size);
351             goto exit;
352         }
353         test_info.tinfo[i].inBuf2 =
354             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
355                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
356         if (error || NULL == test_info.tinfo[i].inBuf2)
357         {
358             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
359                        "region {%zd, %zd}\n",
360                        region.origin, region.size);
361             goto exit;
362         }
363 
364         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
365         {
366             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
367                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
368                 &region, &error);
369             if (error || NULL == test_info.tinfo[i].outBuf[j])
370             {
371                 vlog_error("Error: Unable to create sub-buffer of "
372                            "gOutBuffer[%d] for region {%zd, %zd}\n",
373                            (int)j, region.origin, region.size);
374                 goto exit;
375             }
376         }
377         test_info.tinfo[i].tQueue =
378             clCreateCommandQueue(gContext, gDevice, 0, &error);
379         if (NULL == test_info.tinfo[i].tQueue || error)
380         {
381             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
382             goto exit;
383         }
384 
385         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
386     }
387 
388     // Init the kernels
389     {
390         BuildKernelInfo build_info = {
391             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
392             test_info.programs,  f->nameInCode,         relaxedMode
393         };
394         if ((error = ThreadPool_Do(BuildKernelFn,
395                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
396                                    &build_info)))
397             goto exit;
398     }
399 
400     // Run the kernels
401     if (!gSkipCorrectnessTesting)
402     {
403         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
404 
405         // Accumulate the arithmetic errors
406         for (cl_uint i = 0; i < test_info.threadCount; i++)
407         {
408             if (test_info.tinfo[i].maxError > maxError)
409             {
410                 maxError = test_info.tinfo[i].maxError;
411                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
412                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
413             }
414         }
415 
416         if (error) goto exit;
417 
418         if (gWimpyMode)
419             vlog("Wimp pass");
420         else
421             vlog("passed");
422 
423         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
424     }
425 
426     vlog("\n");
427 
428 exit:
429     // Release
430     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
431     {
432         clReleaseProgram(test_info.programs[i]);
433         if (test_info.k[i])
434         {
435             for (cl_uint j = 0; j < test_info.threadCount; j++)
436                 clReleaseKernel(test_info.k[i][j]);
437 
438             free(test_info.k[i]);
439         }
440     }
441     if (test_info.tinfo)
442     {
443         for (cl_uint i = 0; i < test_info.threadCount; i++)
444         {
445             free_mtdata(test_info.tinfo[i].d);
446             clReleaseMemObject(test_info.tinfo[i].inBuf);
447             clReleaseMemObject(test_info.tinfo[i].inBuf2);
448             for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
449                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
450             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
451         }
452 
453         free(test_info.tinfo);
454     }
455 
456     return error;
457 }
458 
Test(cl_uint job_id,cl_uint thread_id,void * data)459 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
460 {
461     const TestInfo *job = (const TestInfo *)data;
462     size_t buffer_elements = job->subBufferSize;
463     size_t buffer_size = buffer_elements * sizeof(cl_float);
464     cl_uint base = job_id * (cl_uint)job->step;
465     ThreadInfo *tinfo = job->tinfo + thread_id;
466     fptr func = job->f->func;
467     int ftz = job->ftz;
468     bool relaxedMode = job->relaxedMode;
469     float ulps = getAllowedUlpError(job->f, relaxedMode);
470     MTdata d = tinfo->d;
471     cl_int error;
472     cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
473     const char *name = job->f->name;
474     int isFDim = job->isFDim;
475     int skipNanInf = job->skipNanInf;
476     int isNextafter = job->isNextafter;
477     cl_uint *t = 0;
478     cl_float *r = 0;
479     cl_float *s = 0;
480     cl_float *s2 = 0;
481     cl_int copysign_test = 0;
482     RoundingMode oldRoundMode;
483     int skipVerification = 0;
484 
485     if (relaxedMode)
486     {
487         func = job->f->rfunc;
488         if (strcmp(name, "pow") == 0 && gFastRelaxedDerived)
489         {
490             ulps = INFINITY;
491             skipVerification = 1;
492         }
493     }
494 
495     // start the map of the output arrays
496     cl_event e[VECTOR_SIZE_COUNT];
497     cl_uint *out[VECTOR_SIZE_COUNT];
498     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
499     {
500         out[j] = (cl_uint *)clEnqueueMapBuffer(
501             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
502             buffer_size, 0, NULL, e + j, &error);
503         if (error || NULL == out[j])
504         {
505             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
506                        error);
507             return error;
508         }
509     }
510 
511     // Get that moving
512     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
513 
514     // Init input array
515     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
516     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
517     cl_uint idx = 0;
518     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
519     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
520 
521     if (job_id <= (cl_uint)lastSpecialJobIndex)
522     { // test edge cases
523         float *fp = (float *)p;
524         float *fp2 = (float *)p2;
525         uint32_t x, y;
526 
527         x = (job_id * buffer_elements) % specialValuesCount;
528         y = (job_id * buffer_elements) / specialValuesCount;
529 
530         for (; idx < buffer_elements; idx++)
531         {
532             fp[idx] = specialValues[x];
533             fp2[idx] = specialValues[y];
534             ++x;
535             if (x >= specialValuesCount)
536             {
537                 x = 0;
538                 y++;
539                 if (y >= specialValuesCount) break;
540             }
541         }
542     }
543 
544     // Init any remaining values.
545     for (; idx < buffer_elements; idx++)
546     {
547         p[idx] = genrand_int32(d);
548         p2[idx] = genrand_int32(d);
549     }
550 
551     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
552                                       buffer_size, p, 0, NULL, NULL)))
553     {
554         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
555         goto exit;
556     }
557 
558     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
559                                       buffer_size, p2, 0, NULL, NULL)))
560     {
561         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
562         goto exit;
563     }
564 
565     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
566     {
567         // Wait for the map to finish
568         if ((error = clWaitForEvents(1, e + j)))
569         {
570             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
571             goto exit;
572         }
573         if ((error = clReleaseEvent(e[j])))
574         {
575             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
576             goto exit;
577         }
578 
579         // Fill the result buffer with garbage, so that old results don't carry
580         // over
581         uint32_t pattern = 0xffffdead;
582         memset_pattern4(out[j], &pattern, buffer_size);
583         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
584                                              out[j], 0, NULL, NULL)))
585         {
586             vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
587             goto exit;
588         }
589 
590         // run the kernel
591         size_t vectorCount =
592             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
593         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
594                                                  // own copy of the cl_kernel
595         cl_program program = job->programs[j];
596 
597         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
598                                     &tinfo->outBuf[j])))
599         {
600             LogBuildError(program);
601             return error;
602         }
603         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
604                                     &tinfo->inBuf)))
605         {
606             LogBuildError(program);
607             return error;
608         }
609         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
610                                     &tinfo->inBuf2)))
611         {
612             LogBuildError(program);
613             return error;
614         }
615 
616         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
617                                             &vectorCount, NULL, 0, NULL, NULL)))
618         {
619             vlog_error("FAILED -- could not execute kernel\n");
620             goto exit;
621         }
622     }
623 
624     // Get that moving
625     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
626 
627     if (gSkipCorrectnessTesting)
628     {
629         if ((error = clFinish(tinfo->tQueue)))
630         {
631             vlog_error("Error: clFinish failed! err: %d\n", error);
632             goto exit;
633         }
634         free(overflow);
635         return CL_SUCCESS;
636     }
637 
638     FPU_mode_type oldMode;
639     oldRoundMode = kRoundToNearestEven;
640     if (isFDim)
641     {
642         // Calculate the correctly rounded reference result
643         memset(&oldMode, 0, sizeof(oldMode));
644         if (ftz) ForceFTZ(&oldMode);
645 
646         // Set the rounding mode to match the device
647         if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
648     }
649 
650     if (!strcmp(name, "copysign")) copysign_test = 1;
651 
652 #define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2))
653 
654     // Calculate the correctly rounded reference result
655     r = (float *)gOut_Ref + thread_id * buffer_elements;
656     s = (float *)gIn + thread_id * buffer_elements;
657     s2 = (float *)gIn2 + thread_id * buffer_elements;
658     if (skipNanInf)
659     {
660         for (size_t j = 0; j < buffer_elements; j++)
661         {
662             feclearexcept(FE_OVERFLOW);
663             r[j] = (float)ref_func(s[j], s2[j]);
664             overflow[j] =
665                 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
666         }
667     }
668     else
669     {
670         for (size_t j = 0; j < buffer_elements; j++)
671             r[j] = (float)ref_func(s[j], s2[j]);
672     }
673 
674     if (isFDim && ftz) RestoreFPState(&oldMode);
675 
676     // Read the data back -- no need to wait for the first N-1 buffers but wait
677     // for the last buffer. This is an in order queue.
678     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
679     {
680         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
681         out[j] = (cl_uint *)clEnqueueMapBuffer(
682             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
683             buffer_size, 0, NULL, NULL, &error);
684         if (error || NULL == out[j])
685         {
686             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
687                        error);
688             goto exit;
689         }
690     }
691 
692     if (!skipVerification)
693     {
694         // Verify data
695         t = (cl_uint *)r;
696         for (size_t j = 0; j < buffer_elements; j++)
697         {
698             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
699             {
700                 cl_uint *q = out[k];
701 
702                 // If we aren't getting the correctly rounded result
703                 if (t[j] != q[j])
704                 {
705                     float test = ((float *)q)[j];
706                     double correct = ref_func(s[j], s2[j]);
707 
708                     // Per section 10 paragraph 6, accept any result if an input
709                     // or output is a infinity or NaN or overflow As per
710                     // OpenCL 2.0 spec, section 5.8.4.3, enabling
711                     // fast-relaxed-math mode also enables -cl-finite-math-only
712                     // optimization. This optimization allows to assume that
713                     // arguments and results are not NaNs or +/-INFs. Hence,
714                     // accept any result if inputs or results are NaNs or INFs.
715                     if (relaxedMode || skipNanInf)
716                     {
717                         if (skipNanInf && overflow[j]) continue;
718                         // Note: no double rounding here.  Reference functions
719                         // calculate in single precision.
720                         if (IsFloatInfinity(correct) || IsFloatNaN(correct)
721                             || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
722                             || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
723                             continue;
724                     }
725 
726                     float err = Ulp_Error(test, correct);
727                     int fail = !(fabsf(err) <= ulps);
728 
729                     if (fail && ftz)
730                     {
731                         // retry per section 6.5.3.2
732                         if (IsFloatResultSubnormal(correct, ulps))
733                         {
734                             fail = fail && (test != 0.0f);
735                             if (!fail) err = 0.0f;
736                         }
737 
738                         // nextafter on FTZ platforms may return the smallest
739                         // normal float (2^-126) given a denormal or a zero
740                         // as the first argument. The rationale here is that
741                         // nextafter flushes the argument to zero and then
742                         // returns the next representable number in the
743                         // direction of the second argument, and since
744                         // denorms are considered as zero, the smallest
745                         // normal number is the next representable number.
746                         // In which case, it should have the same sign as the
747                         // second argument.
748                         if (isNextafter)
749                         {
750                             if (IsFloatSubnormal(s[j]) || s[j] == 0.0f)
751                             {
752                                 float value = copysignf(twoToMinus126, s2[j]);
753                                 fail = fail && (test != value);
754                                 if (!fail) err = 0.0f;
755                             }
756                         }
757                         else
758                         {
759                             // retry per section 6.5.3.3
760                             if (IsFloatSubnormal(s[j]))
761                             {
762                                 double correct2, correct3;
763                                 float err2, err3;
764 
765                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
766 
767                                 correct2 = ref_func(0.0, s2[j]);
768                                 correct3 = ref_func(-0.0, s2[j]);
769 
770                                 // Per section 10 paragraph 6, accept any result
771                                 // if an input or output is a infinity or NaN or
772                                 // overflow As per OpenCL 2.0 spec,
773                                 // section 5.8.4.3, enabling fast-relaxed-math
774                                 // mode also enables -cl-finite-math-only
775                                 // optimization. This optimization allows to
776                                 // assume that arguments and results are not
777                                 // NaNs or +/-INFs. Hence, accept any result if
778                                 // inputs or results are NaNs or INFs.
779                                 if (relaxedMode || skipNanInf)
780                                 {
781                                     if (fetestexcept(FE_OVERFLOW) && skipNanInf)
782                                         continue;
783 
784                                     // Note: no double rounding here.  Reference
785                                     // functions calculate in single precision.
786                                     if (IsFloatInfinity(correct2)
787                                         || IsFloatNaN(correct2)
788                                         || IsFloatInfinity(correct3)
789                                         || IsFloatNaN(correct3))
790                                         continue;
791                                 }
792 
793                                 err2 = Ulp_Error(test, correct2);
794                                 err3 = Ulp_Error(test, correct3);
795                                 fail = fail
796                                     && ((!(fabsf(err2) <= ulps))
797                                         && (!(fabsf(err3) <= ulps)));
798                                 if (fabsf(err2) < fabsf(err)) err = err2;
799                                 if (fabsf(err3) < fabsf(err)) err = err3;
800 
801                                 // retry per section 6.5.3.4
802                                 if (IsFloatResultSubnormal(correct2, ulps)
803                                     || IsFloatResultSubnormal(correct3, ulps))
804                                 {
805                                     fail = fail && (test != 0.0f);
806                                     if (!fail) err = 0.0f;
807                                 }
808 
809                                 // try with both args as zero
810                                 if (IsFloatSubnormal(s2[j]))
811                                 {
812                                     double correct4, correct5;
813                                     float err4, err5;
814 
815                                     if (skipNanInf) feclearexcept(FE_OVERFLOW);
816 
817                                     correct2 = ref_func(0.0, 0.0);
818                                     correct3 = ref_func(-0.0, 0.0);
819                                     correct4 = ref_func(0.0, -0.0);
820                                     correct5 = ref_func(-0.0, -0.0);
821 
822                                     // Per section 10 paragraph 6, accept any
823                                     // result if an input or output is a
824                                     // infinity or NaN or overflow As per
825                                     // OpenCL 2.0 spec, section 5.8.4.3,
826                                     // enabling fast-relaxed-math mode also
827                                     // enables -cl-finite-math-only
828                                     // optimization. This optimization allows to
829                                     // assume that arguments and results are not
830                                     // NaNs or +/-INFs. Hence, accept any result
831                                     // if inputs or results are NaNs or INFs.
832                                     if (relaxedMode || skipNanInf)
833                                     {
834                                         if (fetestexcept(FE_OVERFLOW)
835                                             && skipNanInf)
836                                             continue;
837 
838                                         // Note: no double rounding here.
839                                         // Reference functions calculate in
840                                         // single precision.
841                                         if (IsFloatInfinity(correct2)
842                                             || IsFloatNaN(correct2)
843                                             || IsFloatInfinity(correct3)
844                                             || IsFloatNaN(correct3)
845                                             || IsFloatInfinity(correct4)
846                                             || IsFloatNaN(correct4)
847                                             || IsFloatInfinity(correct5)
848                                             || IsFloatNaN(correct5))
849                                             continue;
850                                     }
851 
852                                     err2 = Ulp_Error(test, correct2);
853                                     err3 = Ulp_Error(test, correct3);
854                                     err4 = Ulp_Error(test, correct4);
855                                     err5 = Ulp_Error(test, correct5);
856                                     fail = fail
857                                         && ((!(fabsf(err2) <= ulps))
858                                             && (!(fabsf(err3) <= ulps))
859                                             && (!(fabsf(err4) <= ulps))
860                                             && (!(fabsf(err5) <= ulps)));
861                                     if (fabsf(err2) < fabsf(err)) err = err2;
862                                     if (fabsf(err3) < fabsf(err)) err = err3;
863                                     if (fabsf(err4) < fabsf(err)) err = err4;
864                                     if (fabsf(err5) < fabsf(err)) err = err5;
865 
866                                     // retry per section 6.5.3.4
867                                     if (IsFloatResultSubnormal(correct2, ulps)
868                                         || IsFloatResultSubnormal(correct3,
869                                                                   ulps)
870                                         || IsFloatResultSubnormal(correct4,
871                                                                   ulps)
872                                         || IsFloatResultSubnormal(correct5,
873                                                                   ulps))
874                                     {
875                                         fail = fail && (test != 0.0f);
876                                         if (!fail) err = 0.0f;
877                                     }
878                                 }
879                             }
880                             else if (IsFloatSubnormal(s2[j]))
881                             {
882                                 double correct2, correct3;
883                                 float err2, err3;
884 
885                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
886 
887                                 correct2 = ref_func(s[j], 0.0);
888                                 correct3 = ref_func(s[j], -0.0);
889 
890                                 // Per section 10 paragraph 6, accept any result
891                                 // if an input or output is a infinity or NaN or
892                                 // overflow As per OpenCL 2.0 spec,
893                                 // section 5.8.4.3, enabling fast-relaxed-math
894                                 // mode also enables -cl-finite-math-only
895                                 // optimization. This optimization allows to
896                                 // assume that arguments and results are not
897                                 // NaNs or +/-INFs. Hence, accept any result if
898                                 // inputs or results are NaNs or INFs.
899                                 if (relaxedMode || skipNanInf)
900                                 {
901                                     // Note: no double rounding here.  Reference
902                                     // functions calculate in single precision.
903                                     if (overflow[j] && skipNanInf) continue;
904 
905                                     if (IsFloatInfinity(correct2)
906                                         || IsFloatNaN(correct2)
907                                         || IsFloatInfinity(correct3)
908                                         || IsFloatNaN(correct3))
909                                         continue;
910                                 }
911 
912                                 err2 = Ulp_Error(test, correct2);
913                                 err3 = Ulp_Error(test, correct3);
914                                 fail = fail
915                                     && ((!(fabsf(err2) <= ulps))
916                                         && (!(fabsf(err3) <= ulps)));
917                                 if (fabsf(err2) < fabsf(err)) err = err2;
918                                 if (fabsf(err3) < fabsf(err)) err = err3;
919 
920                                 // retry per section 6.5.3.4
921                                 if (IsFloatResultSubnormal(correct2, ulps)
922                                     || IsFloatResultSubnormal(correct3, ulps))
923                                 {
924                                     fail = fail && (test != 0.0f);
925                                     if (!fail) err = 0.0f;
926                                 }
927                             }
928                         }
929                     }
930 
931                     if (fabsf(err) > tinfo->maxError)
932                     {
933                         tinfo->maxError = fabsf(err);
934                         tinfo->maxErrorValue = s[j];
935                         tinfo->maxErrorValue2 = s2[j];
936                     }
937                     if (fail)
938                     {
939                         vlog_error(
940                             "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a "
941                             "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %d\n",
942                             name, sizeNames[k], err, s[j], ((cl_uint *)s)[j],
943                             s2[j], ((cl_uint *)s2)[j], r[j], test,
944                             ((cl_uint *)&test)[0], j);
945                         error = -1;
946                         goto exit;
947                     }
948                 }
949             }
950         }
951     }
952 
953     if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
954 
955     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
956     {
957         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
958                                              out[j], 0, NULL, NULL)))
959         {
960             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
961                        j, error);
962             return error;
963         }
964     }
965 
966     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
967 
968 
969     if (0 == (base & 0x0fffffff))
970     {
971         if (gVerboseBruteForce)
972         {
973             vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
974                  "ThreadCount:%2u\n",
975                  base, job->step, job->scale, buffer_elements, job->ulps,
976                  job->threadCount);
977         }
978         else
979         {
980             vlog(".");
981         }
982         fflush(stdout);
983     }
984 
985 exit:
986     if (overflow) free(overflow);
987     return error;
988 }
989