1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <cstring>
22 
23 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
24 
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)25 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
26                        cl_kernel *k, cl_program *p, bool relaxedMode)
27 {
28     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
29                         "__kernel void math_kernel",
30                         sizeNames[vectorSize],
31                         "( __global double",
32                         sizeNames[vectorSize],
33                         "* out, __global double",
34                         sizeNames[vectorSize],
35                         "* in1, __global double",
36                         sizeNames[vectorSize],
37                         "* in2 )\n"
38                         "{\n"
39                         "   size_t i = get_global_id(0);\n"
40                         "   out[i] = ",
41                         name,
42                         "( in1[i], in2[i] );\n"
43                         "}\n" };
44 
45     const char *c3[] = {
46         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
47         "__kernel void math_kernel",
48         sizeNames[vectorSize],
49         "( __global double* out, __global double* in, __global double* in2)\n"
50         "{\n"
51         "   size_t i = get_global_id(0);\n"
52         "   if( i + 1 < get_global_size(0) )\n"
53         "   {\n"
54         "       double3 d0 = vload3( 0, in + 3 * i );\n"
55         "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
56         "       d0 = ",
57         name,
58         "( d0, d1 );\n"
59         "       vstore3( d0, 0, out + 3*i );\n"
60         "   }\n"
61         "   else\n"
62         "   {\n"
63         "       size_t parity = i & 1;   // Figure out how many elements are "
64         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
65         "buffer size \n"
66         "       double3 d0;\n"
67         "       double3 d1;\n"
68         "       switch( parity )\n"
69         "       {\n"
70         "           case 1:\n"
71         "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
72         "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
73         "               break;\n"
74         "           case 0:\n"
75         "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
76         "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
77         "               break;\n"
78         "       }\n"
79         "       d0 = ",
80         name,
81         "( d0, d1 );\n"
82         "       switch( parity )\n"
83         "       {\n"
84         "           case 0:\n"
85         "               out[3*i+1] = d0.y; \n"
86         "               // fall through\n"
87         "           case 1:\n"
88         "               out[3*i] = d0.x; \n"
89         "               break;\n"
90         "       }\n"
91         "   }\n"
92         "}\n"
93     };
94 
95     const char **kern = c;
96     size_t kernSize = sizeof(c) / sizeof(c[0]);
97 
98     if (sizeValues[vectorSize] == 3)
99     {
100         kern = c3;
101         kernSize = sizeof(c3) / sizeof(c3[0]);
102     }
103 
104     char testName[32];
105     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
106              sizeNames[vectorSize]);
107 
108     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
109                        relaxedMode);
110 }
111 
112 typedef struct BuildKernelInfo
113 {
114     cl_uint offset; // the first vector size to build
115     cl_uint kernel_count;
116     cl_kernel **kernels;
117     cl_program *programs;
118     const char *nameInCode;
119     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
120 } BuildKernelInfo;
121 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)122 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
123 {
124     BuildKernelInfo *info = (BuildKernelInfo *)p;
125     cl_uint i = info->offset + job_id;
126     return BuildKernel(info->nameInCode, i, info->kernel_count,
127                        info->kernels[i], info->programs + i, info->relaxedMode);
128 }
129 
130 // Thread specific data for a worker thread
131 typedef struct ThreadInfo
132 {
133     cl_mem inBuf; // input buffer for the thread
134     cl_mem inBuf2; // input buffer for the thread
135     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
136     float maxError; // max error value. Init to 0.
137     double
138         maxErrorValue; // position of the max error value (param 1).  Init to 0.
139     double maxErrorValue2; // position of the max error value (param 2).  Init
140                            // to 0.
141     MTdata d;
142     cl_command_queue tQueue; // per thread command queue to improve performance
143 } ThreadInfo;
144 
145 typedef struct TestInfo
146 {
147     size_t subBufferSize; // Size of the sub-buffer in elements
148     const Func *f; // A pointer to the function info
149     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
150     cl_kernel
151         *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
152                                // worker thread:  k[vector_size][thread_id]
153     ThreadInfo *
154         tinfo; // An array of thread specific information for each worker thread
155     cl_uint threadCount; // Number of worker threads
156     cl_uint jobCount; // Number of jobs
157     cl_uint step; // step between each chunk and the next.
158     cl_uint scale; // stride between individual test values
159     float ulps; // max_allowed ulps
160     int ftz; // non-zero if running in flush to zero mode
161 
162     int isFDim;
163     int skipNanInf;
164     int isNextafter;
165     bool relaxedMode; // True if test is running in relaxed mode, false
166                       // otherwise.
167 } TestInfo;
168 
169 // A table of more difficult cases to get right
170 static const double specialValues[] = {
171     -NAN,
172     -INFINITY,
173     -DBL_MAX,
174     MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
175     MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
176     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
177     MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
178     MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
179     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
180     MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
181     MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
182     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
183     MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
184     MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
185     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
186     -1000.0,
187     -100.0,
188     -4.0,
189     -3.5,
190     -3.0,
191     MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
192     -2.5,
193     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
194     -2.0,
195     MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
196     -1.5,
197     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
198     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
199     -1.0,
200     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
201     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
202     -0.5,
203     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
204     MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
205     -0.25,
206     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
207     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
208     -DBL_MIN,
209     MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
210     MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
211     MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
212     MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
213     MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
214     MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
215     MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
216     MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
217     MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
218     MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
219     MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
220     MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
221     MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
222     MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
223     -0.0,
224 
225     +NAN,
226     +INFINITY,
227     +DBL_MAX,
228     MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
229     MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
230     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
231     MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
232     MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
233     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
234     MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
235     MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
236     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
237     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
238     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
239     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
240     +1000.0,
241     +100.0,
242     +4.0,
243     +3.5,
244     +3.0,
245     MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
246     +2.5,
247     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
248     +2.0,
249     MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
250     +1.5,
251     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
252     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
253     +1.0,
254     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
255     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
256     +0.5,
257     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
258     MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
259     +0.25,
260     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
261     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
262     +DBL_MIN,
263     MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
264     MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
265     MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
266     MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
267     MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
268     MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
269     MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
270     MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
271     MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
272     MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
273     MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
274     MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
275     MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
276     MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
277     +0.0,
278 };
279 
280 static size_t specialValuesCount =
281     sizeof(specialValues) / sizeof(specialValues[0]);
282 
283 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
284 
TestFunc_Double_Double_Double(const Func * f,MTdata d,bool relaxedMode)285 int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
286 {
287     TestInfo test_info;
288     cl_int error;
289     float maxError = 0.0f;
290     double maxErrorVal = 0.0;
291     double maxErrorVal2 = 0.0;
292 
293     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
294 
295     // Init test_info
296     memset(&test_info, 0, sizeof(test_info));
297     test_info.threadCount = GetThreadCount();
298     test_info.subBufferSize = BUFFER_SIZE
299         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
300     test_info.scale = getTestScale(sizeof(cl_double));
301 
302     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
303     if (test_info.step / test_info.subBufferSize != test_info.scale)
304     {
305         // there was overflow
306         test_info.jobCount = 1;
307     }
308     else
309     {
310         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
311     }
312 
313     test_info.f = f;
314     test_info.ulps = f->double_ulps;
315     test_info.ftz = f->ftz || gForceFTZ;
316 
317     test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
318     test_info.skipNanInf = 0;
319     test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
320 
321     // cl_kernels aren't thread safe, so we make one for each vector size for
322     // every thread
323     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
324     {
325         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
326         test_info.k[i] = (cl_kernel *)malloc(array_size);
327         if (NULL == test_info.k[i])
328         {
329             vlog_error("Error: Unable to allocate storage for kernels!\n");
330             error = CL_OUT_OF_HOST_MEMORY;
331             goto exit;
332         }
333         memset(test_info.k[i], 0, array_size);
334     }
335     test_info.tinfo =
336         (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
337     if (NULL == test_info.tinfo)
338     {
339         vlog_error(
340             "Error: Unable to allocate storage for thread specific data.\n");
341         error = CL_OUT_OF_HOST_MEMORY;
342         goto exit;
343     }
344     memset(test_info.tinfo, 0,
345            test_info.threadCount * sizeof(*test_info.tinfo));
346     for (cl_uint i = 0; i < test_info.threadCount; i++)
347     {
348         cl_buffer_region region = {
349             i * test_info.subBufferSize * sizeof(cl_double),
350             test_info.subBufferSize * sizeof(cl_double)
351         };
352         test_info.tinfo[i].inBuf =
353             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
354                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
355         if (error || NULL == test_info.tinfo[i].inBuf)
356         {
357             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
358                        "region {%zd, %zd}\n",
359                        region.origin, region.size);
360             goto exit;
361         }
362         test_info.tinfo[i].inBuf2 =
363             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
364                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
365         if (error || NULL == test_info.tinfo[i].inBuf2)
366         {
367             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
368                        "region {%zd, %zd}\n",
369                        region.origin, region.size);
370             goto exit;
371         }
372 
373         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
374         {
375             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
376                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
377                 &region, &error);
378             if (error || NULL == test_info.tinfo[i].outBuf[j])
379             {
380                 vlog_error("Error: Unable to create sub-buffer of "
381                            "gOutBuffer[%d] for region {%zd, %zd}\n",
382                            (int)j, region.origin, region.size);
383                 goto exit;
384             }
385         }
386         test_info.tinfo[i].tQueue =
387             clCreateCommandQueue(gContext, gDevice, 0, &error);
388         if (NULL == test_info.tinfo[i].tQueue || error)
389         {
390             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
391             goto exit;
392         }
393 
394         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
395     }
396 
397     // Init the kernels
398     {
399         BuildKernelInfo build_info = {
400             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
401             test_info.programs,  f->nameInCode,         relaxedMode
402         };
403         if ((error = ThreadPool_Do(BuildKernelFn,
404                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
405                                    &build_info)))
406             goto exit;
407     }
408 
409     // Run the kernels
410     if (!gSkipCorrectnessTesting)
411     {
412         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
413 
414         // Accumulate the arithmetic errors
415         for (cl_uint i = 0; i < test_info.threadCount; i++)
416         {
417             if (test_info.tinfo[i].maxError > maxError)
418             {
419                 maxError = test_info.tinfo[i].maxError;
420                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
421                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
422             }
423         }
424 
425         if (error) goto exit;
426 
427         if (gWimpyMode)
428             vlog("Wimp pass");
429         else
430             vlog("passed");
431 
432         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
433     }
434 
435     vlog("\n");
436 
437 exit:
438     // Release
439     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
440     {
441         clReleaseProgram(test_info.programs[i]);
442         if (test_info.k[i])
443         {
444             for (cl_uint j = 0; j < test_info.threadCount; j++)
445                 clReleaseKernel(test_info.k[i][j]);
446 
447             free(test_info.k[i]);
448         }
449     }
450     if (test_info.tinfo)
451     {
452         for (cl_uint i = 0; i < test_info.threadCount; i++)
453         {
454             free_mtdata(test_info.tinfo[i].d);
455             clReleaseMemObject(test_info.tinfo[i].inBuf);
456             clReleaseMemObject(test_info.tinfo[i].inBuf2);
457             for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
458                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
459             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
460         }
461 
462         free(test_info.tinfo);
463     }
464 
465     return error;
466 }
467 
Test(cl_uint job_id,cl_uint thread_id,void * data)468 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
469 {
470     const TestInfo *job = (const TestInfo *)data;
471     size_t buffer_elements = job->subBufferSize;
472     size_t buffer_size = buffer_elements * sizeof(cl_double);
473     cl_uint base = job_id * (cl_uint)job->step;
474     ThreadInfo *tinfo = job->tinfo + thread_id;
475     float ulps = job->ulps;
476     dptr func = job->f->dfunc;
477     int ftz = job->ftz;
478     MTdata d = tinfo->d;
479     cl_int error;
480     const char *name = job->f->name;
481 
482     int isNextafter = job->isNextafter;
483     cl_ulong *t;
484     cl_double *r;
485     cl_double *s;
486     cl_double *s2;
487 
488     Force64BitFPUPrecision();
489 
490     // start the map of the output arrays
491     cl_event e[VECTOR_SIZE_COUNT];
492     cl_ulong *out[VECTOR_SIZE_COUNT];
493     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
494     {
495         out[j] = (cl_ulong *)clEnqueueMapBuffer(
496             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
497             buffer_size, 0, NULL, e + j, &error);
498         if (error || NULL == out[j])
499         {
500             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
501                        error);
502             return error;
503         }
504     }
505 
506     // Get that moving
507     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
508 
509     // Init input array
510     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
511     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
512     cl_uint idx = 0;
513     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
514     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
515 
516     if (job_id <= (cl_uint)lastSpecialJobIndex)
517     { // test edge cases
518         cl_double *fp = (cl_double *)p;
519         cl_double *fp2 = (cl_double *)p2;
520         uint32_t x, y;
521 
522         x = (job_id * buffer_elements) % specialValuesCount;
523         y = (job_id * buffer_elements) / specialValuesCount;
524 
525         for (; idx < buffer_elements; idx++)
526         {
527             fp[idx] = specialValues[x];
528             fp2[idx] = specialValues[y];
529             if (++x >= specialValuesCount)
530             {
531                 x = 0;
532                 y++;
533                 if (y >= specialValuesCount) break;
534             }
535         }
536     }
537 
538     // Init any remaining values.
539     for (; idx < buffer_elements; idx++)
540     {
541         p[idx] = genrand_int64(d);
542         p2[idx] = genrand_int64(d);
543     }
544 
545     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
546                                       buffer_size, p, 0, NULL, NULL)))
547     {
548         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
549         goto exit;
550     }
551 
552     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
553                                       buffer_size, p2, 0, NULL, NULL)))
554     {
555         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
556         goto exit;
557     }
558 
559     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
560     {
561         // Wait for the map to finish
562         if ((error = clWaitForEvents(1, e + j)))
563         {
564             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
565             goto exit;
566         }
567         if ((error = clReleaseEvent(e[j])))
568         {
569             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
570             goto exit;
571         }
572 
573         // Fill the result buffer with garbage, so that old results don't carry
574         // over
575         uint32_t pattern = 0xffffdead;
576         memset_pattern4(out[j], &pattern, buffer_size);
577         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
578                                              out[j], 0, NULL, NULL)))
579         {
580             vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
581             goto exit;
582         }
583 
584         // run the kernel
585         size_t vectorCount =
586             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
587         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
588                                                  // own copy of the cl_kernel
589         cl_program program = job->programs[j];
590 
591         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
592                                     &tinfo->outBuf[j])))
593         {
594             LogBuildError(program);
595             return error;
596         }
597         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
598                                     &tinfo->inBuf)))
599         {
600             LogBuildError(program);
601             return error;
602         }
603         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
604                                     &tinfo->inBuf2)))
605         {
606             LogBuildError(program);
607             return error;
608         }
609 
610         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
611                                             &vectorCount, NULL, 0, NULL, NULL)))
612         {
613             vlog_error("FAILED -- could not execute kernel\n");
614             goto exit;
615         }
616     }
617 
618     // Get that moving
619     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
620 
621     if (gSkipCorrectnessTesting) return CL_SUCCESS;
622 
623     // Calculate the correctly rounded reference result
624     r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
625     s = (cl_double *)gIn + thread_id * buffer_elements;
626     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
627     for (size_t j = 0; j < buffer_elements; j++)
628         r[j] = (cl_double)func.f_ff(s[j], s2[j]);
629 
630     // Read the data back -- no need to wait for the first N-1 buffers but wait
631     // for the last buffer. This is an in order queue.
632     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
633     {
634         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
635         out[j] = (cl_ulong *)clEnqueueMapBuffer(
636             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
637             buffer_size, 0, NULL, NULL, &error);
638         if (error || NULL == out[j])
639         {
640             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
641                        error);
642             goto exit;
643         }
644     }
645 
646     // Verify data
647     t = (cl_ulong *)r;
648     for (size_t j = 0; j < buffer_elements; j++)
649     {
650         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
651         {
652             cl_ulong *q = out[k];
653 
654             // If we aren't getting the correctly rounded result
655             if (t[j] != q[j])
656             {
657                 cl_double test = ((cl_double *)q)[j];
658                 long double correct = func.f_ff(s[j], s2[j]);
659                 float err = Bruteforce_Ulp_Error_Double(test, correct);
660                 int fail = !(fabsf(err) <= ulps);
661 
662                 if (fail && ftz)
663                 {
664                     // retry per section 6.5.3.2
665                     if (IsDoubleResultSubnormal(correct, ulps))
666                     {
667                         fail = fail && (test != 0.0f);
668                         if (!fail) err = 0.0f;
669                     }
670 
671                     // nextafter on FTZ platforms may return the smallest
672                     // normal float (2^-126) given a denormal or a zero
673                     // as the first argument. The rationale here is that
674                     // nextafter flushes the argument to zero and then
675                     // returns the next representable number in the
676                     // direction of the second argument, and since
677                     // denorms are considered as zero, the smallest
678                     // normal number is the next representable number.
679                     // In which case, it should have the same sign as the
680                     // second argument.
681                     if (isNextafter)
682                     {
683                         if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
684                         {
685                             cl_double value = copysign(twoToMinus1022, s2[j]);
686                             fail = fail && (test != value);
687                             if (!fail) err = 0.0f;
688                         }
689                     }
690                     else
691                     {
692                         // retry per section 6.5.3.3
693                         if (IsDoubleSubnormal(s[j]))
694                         {
695                             long double correct2 = func.f_ff(0.0, s2[j]);
696                             long double correct3 = func.f_ff(-0.0, s2[j]);
697                             float err2 =
698                                 Bruteforce_Ulp_Error_Double(test, correct2);
699                             float err3 =
700                                 Bruteforce_Ulp_Error_Double(test, correct3);
701                             fail = fail
702                                 && ((!(fabsf(err2) <= ulps))
703                                     && (!(fabsf(err3) <= ulps)));
704                             if (fabsf(err2) < fabsf(err)) err = err2;
705                             if (fabsf(err3) < fabsf(err)) err = err3;
706 
707                             // retry per section 6.5.3.4
708                             if (IsDoubleResultSubnormal(correct2, ulps)
709                                 || IsDoubleResultSubnormal(correct3, ulps))
710                             {
711                                 fail = fail && (test != 0.0f);
712                                 if (!fail) err = 0.0f;
713                             }
714 
715                             // try with both args as zero
716                             if (IsDoubleSubnormal(s2[j]))
717                             {
718                                 correct2 = func.f_ff(0.0, 0.0);
719                                 correct3 = func.f_ff(-0.0, 0.0);
720                                 long double correct4 = func.f_ff(0.0, -0.0);
721                                 long double correct5 = func.f_ff(-0.0, -0.0);
722                                 err2 =
723                                     Bruteforce_Ulp_Error_Double(test, correct2);
724                                 err3 =
725                                     Bruteforce_Ulp_Error_Double(test, correct3);
726                                 float err4 =
727                                     Bruteforce_Ulp_Error_Double(test, correct4);
728                                 float err5 =
729                                     Bruteforce_Ulp_Error_Double(test, correct5);
730                                 fail = fail
731                                     && ((!(fabsf(err2) <= ulps))
732                                         && (!(fabsf(err3) <= ulps))
733                                         && (!(fabsf(err4) <= ulps))
734                                         && (!(fabsf(err5) <= ulps)));
735                                 if (fabsf(err2) < fabsf(err)) err = err2;
736                                 if (fabsf(err3) < fabsf(err)) err = err3;
737                                 if (fabsf(err4) < fabsf(err)) err = err4;
738                                 if (fabsf(err5) < fabsf(err)) err = err5;
739 
740                                 // retry per section 6.5.3.4
741                                 if (IsDoubleResultSubnormal(correct2, ulps)
742                                     || IsDoubleResultSubnormal(correct3, ulps)
743                                     || IsDoubleResultSubnormal(correct4, ulps)
744                                     || IsDoubleResultSubnormal(correct5, ulps))
745                                 {
746                                     fail = fail && (test != 0.0f);
747                                     if (!fail) err = 0.0f;
748                                 }
749                             }
750                         }
751                         else if (IsDoubleSubnormal(s2[j]))
752                         {
753                             long double correct2 = func.f_ff(s[j], 0.0);
754                             long double correct3 = func.f_ff(s[j], -0.0);
755                             float err2 =
756                                 Bruteforce_Ulp_Error_Double(test, correct2);
757                             float err3 =
758                                 Bruteforce_Ulp_Error_Double(test, correct3);
759                             fail = fail
760                                 && ((!(fabsf(err2) <= ulps))
761                                     && (!(fabsf(err3) <= ulps)));
762                             if (fabsf(err2) < fabsf(err)) err = err2;
763                             if (fabsf(err3) < fabsf(err)) err = err3;
764 
765                             // retry per section 6.5.3.4
766                             if (IsDoubleResultSubnormal(correct2, ulps)
767                                 || IsDoubleResultSubnormal(correct3, ulps))
768                             {
769                                 fail = fail && (test != 0.0f);
770                                 if (!fail) err = 0.0f;
771                             }
772                         }
773                     }
774                 }
775 
776                 if (fabsf(err) > tinfo->maxError)
777                 {
778                     tinfo->maxError = fabsf(err);
779                     tinfo->maxErrorValue = s[j];
780                     tinfo->maxErrorValue2 = s2[j];
781                 }
782                 if (fail)
783                 {
784                     vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
785                                "%.13la}: *%.13la vs. %.13la\n",
786                                name, sizeNames[k], err, s[j], s2[j], r[j],
787                                test);
788                     error = -1;
789                     goto exit;
790                 }
791             }
792         }
793     }
794 
795     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
796     {
797         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
798                                              out[j], 0, NULL, NULL)))
799         {
800             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
801                        j, error);
802             return error;
803         }
804     }
805 
806     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
807 
808 
809     if (0 == (base & 0x0fffffff))
810     {
811         if (gVerboseBruteForce)
812         {
813             vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
814                  "ThreadCount:%2u\n",
815                  base, job->step, job->scale, buffer_elements, job->ulps,
816                  job->threadCount);
817         }
818         else
819         {
820             vlog(".");
821         }
822         fflush(stdout);
823     }
824 
825 exit:
826     return error;
827 }
828