1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <cstring>
22 
BuildKernel(const char * operator_symbol,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)23 static int BuildKernel(const char *operator_symbol, int vectorSize,
24                        cl_uint kernel_count, cl_kernel *k, cl_program *p,
25                        bool relaxedMode)
26 {
27     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
28                         "__kernel void math_kernel",
29                         sizeNames[vectorSize],
30                         "( __global double",
31                         sizeNames[vectorSize],
32                         "* out, __global double",
33                         sizeNames[vectorSize],
34                         "* in1, __global double",
35                         sizeNames[vectorSize],
36                         "* in2 )\n"
37                         "{\n"
38                         "   size_t i = get_global_id(0);\n"
39                         "   out[i] = in1[i] ",
40                         operator_symbol,
41                         " in2[i];\n"
42                         "}\n" };
43 
44     const char *c3[] = {
45         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
46         "__kernel void math_kernel",
47         sizeNames[vectorSize],
48         "( __global double* out, __global double* in, __global double* in2)\n"
49         "{\n"
50         "   size_t i = get_global_id(0);\n"
51         "   if( i + 1 < get_global_size(0) )\n"
52         "   {\n"
53         "       double3 d0 = vload3( 0, in + 3 * i );\n"
54         "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
55         "       d0 = d0 ",
56         operator_symbol,
57         " d1;\n"
58         "       vstore3( d0, 0, out + 3*i );\n"
59         "   }\n"
60         "   else\n"
61         "   {\n"
62         "       size_t parity = i & 1;   // Figure out how many elements are "
63         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
64         "buffer size \n"
65         "       double3 d0;\n"
66         "       double3 d1;\n"
67         "       switch( parity )\n"
68         "       {\n"
69         "           case 1:\n"
70         "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
71         "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
72         "               break;\n"
73         "           case 0:\n"
74         "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
75         "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
76         "               break;\n"
77         "       }\n"
78         "       d0 = d0 ",
79         operator_symbol,
80         " d1;\n"
81         "       switch( parity )\n"
82         "       {\n"
83         "           case 0:\n"
84         "               out[3*i+1] = d0.y; \n"
85         "               // fall through\n"
86         "           case 1:\n"
87         "               out[3*i] = d0.x; \n"
88         "               break;\n"
89         "       }\n"
90         "   }\n"
91         "}\n"
92     };
93 
94     const char **kern = c;
95     size_t kernSize = sizeof(c) / sizeof(c[0]);
96 
97     if (sizeValues[vectorSize] == 3)
98     {
99         kern = c3;
100         kernSize = sizeof(c3) / sizeof(c3[0]);
101     }
102 
103     char testName[32];
104     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
105              sizeNames[vectorSize]);
106 
107     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
108                        relaxedMode);
109 }
110 
111 typedef struct BuildKernelInfo
112 {
113     cl_uint offset; // the first vector size to build
114     cl_uint kernel_count;
115     cl_kernel **kernels;
116     cl_program *programs;
117     const char *operator_symbol;
118     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
119 } BuildKernelInfo;
120 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)121 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
122 {
123     BuildKernelInfo *info = (BuildKernelInfo *)p;
124     cl_uint i = info->offset + job_id;
125     return BuildKernel(info->operator_symbol, i, info->kernel_count,
126                        info->kernels[i], info->programs + i, info->relaxedMode);
127 }
128 
129 // Thread specific data for a worker thread
130 typedef struct ThreadInfo
131 {
132     cl_mem inBuf; // input buffer for the thread
133     cl_mem inBuf2; // input buffer for the thread
134     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
135     float maxError; // max error value. Init to 0.
136     double
137         maxErrorValue; // position of the max error value (param 1).  Init to 0.
138     double maxErrorValue2; // position of the max error value (param 2).  Init
139                            // to 0.
140     MTdata d;
141     cl_command_queue tQueue; // per thread command queue to improve performance
142 } ThreadInfo;
143 
144 typedef struct TestInfo
145 {
146     size_t subBufferSize; // Size of the sub-buffer in elements
147     const Func *f; // A pointer to the function info
148     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
149     cl_kernel
150         *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
151                                // worker thread:  k[vector_size][thread_id]
152     ThreadInfo *
153         tinfo; // An array of thread specific information for each worker thread
154     cl_uint threadCount; // Number of worker threads
155     cl_uint jobCount; // Number of jobs
156     cl_uint step; // step between each chunk and the next.
157     cl_uint scale; // stride between individual test values
158     float ulps; // max_allowed ulps
159     int ftz; // non-zero if running in flush to zero mode
160     bool relaxedMode; // True if the test is being run in relaxed mode, false
161                       // otherwise.
162 
163     // no special fields
164 } TestInfo;
165 
166 // A table of more difficult cases to get right
167 static const double specialValues[] = {
168     -NAN,
169     -INFINITY,
170     -DBL_MAX,
171     MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
172     MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
173     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
174     MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
175     MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
176     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
177     MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
178     MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
179     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
180     MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
181     MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
182     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
183     -1000.,
184     -100.,
185     -4.0,
186     -3.5,
187     -3.0,
188     MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
189     -2.5,
190     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
191     -2.0,
192     MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
193     -1.5,
194     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
195     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
196     -1.0,
197     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
198     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
199     -0.5,
200     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
201     MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
202     -0.25,
203     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
204     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
205     -DBL_MIN,
206     MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
207     MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
208     MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
209     MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
210     MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
211     MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
212     MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
213     MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
214     MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
215     MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
216     MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
217     MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
218     MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
219     MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
220     -0.0,
221 
222     +NAN,
223     +INFINITY,
224     +DBL_MAX,
225     MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
226     MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
227     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
228     MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
229     MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
230     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
231     MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
232     MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
233     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
234     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
235     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
236     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
237     +1000.0,
238     +100.0,
239     +4.0,
240     +3.5,
241     +3.0,
242     MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
243     +2.5,
244     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
245     +2.0,
246     MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
247     +1.5,
248     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
249     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
250     +1.0,
251     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
252     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
253     +0.5,
254     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
255     MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
256     +0.25,
257     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
258     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
259     +DBL_MIN,
260     MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
261     MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
262     MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
263     MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
264     MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
265     MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
266     MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
267     MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
268     MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
269     MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
270     MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
271     MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
272     MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
273     MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
274     +0.0,
275 };
276 
277 static const size_t specialValuesCount =
278     sizeof(specialValues) / sizeof(specialValues[0]);
279 
280 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
281 
TestFunc_Double_Double_Double_Operator(const Func * f,MTdata d,bool relaxedMode)282 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
283                                            bool relaxedMode)
284 {
285     TestInfo test_info;
286     cl_int error;
287     float maxError = 0.0f;
288     double maxErrorVal = 0.0;
289     double maxErrorVal2 = 0.0;
290 
291     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
292 
293     // Init test_info
294     memset(&test_info, 0, sizeof(test_info));
295     test_info.threadCount = GetThreadCount();
296     test_info.subBufferSize = BUFFER_SIZE
297         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
298     test_info.scale = getTestScale(sizeof(cl_double));
299 
300     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
301     if (test_info.step / test_info.subBufferSize != test_info.scale)
302     {
303         // there was overflow
304         test_info.jobCount = 1;
305     }
306     else
307     {
308         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
309     }
310 
311     test_info.f = f;
312     test_info.ulps = f->double_ulps;
313     test_info.ftz = f->ftz || gForceFTZ;
314 
315     // cl_kernels aren't thread safe, so we make one for each vector size for
316     // every thread
317     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
318     {
319         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
320         test_info.k[i] = (cl_kernel *)malloc(array_size);
321         if (NULL == test_info.k[i])
322         {
323             vlog_error("Error: Unable to allocate storage for kernels!\n");
324             error = CL_OUT_OF_HOST_MEMORY;
325             goto exit;
326         }
327         memset(test_info.k[i], 0, array_size);
328     }
329     test_info.tinfo =
330         (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
331     if (NULL == test_info.tinfo)
332     {
333         vlog_error(
334             "Error: Unable to allocate storage for thread specific data.\n");
335         error = CL_OUT_OF_HOST_MEMORY;
336         goto exit;
337     }
338     memset(test_info.tinfo, 0,
339            test_info.threadCount * sizeof(*test_info.tinfo));
340     for (cl_uint i = 0; i < test_info.threadCount; i++)
341     {
342         cl_buffer_region region = {
343             i * test_info.subBufferSize * sizeof(cl_double),
344             test_info.subBufferSize * sizeof(cl_double)
345         };
346         test_info.tinfo[i].inBuf =
347             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
348                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
349         if (error || NULL == test_info.tinfo[i].inBuf)
350         {
351             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
352                        "region {%zd, %zd}\n",
353                        region.origin, region.size);
354             goto exit;
355         }
356         test_info.tinfo[i].inBuf2 =
357             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
358                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
359         if (error || NULL == test_info.tinfo[i].inBuf2)
360         {
361             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
362                        "region {%zd, %zd}\n",
363                        region.origin, region.size);
364             goto exit;
365         }
366 
367         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
368         {
369             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
370                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
371                 &region, &error);
372             if (error || NULL == test_info.tinfo[i].outBuf[j])
373             {
374                 vlog_error("Error: Unable to create sub-buffer of "
375                            "gOutBuffer[%d] for region {%zd, %zd}\n",
376                            (int)j, region.origin, region.size);
377                 goto exit;
378             }
379         }
380         test_info.tinfo[i].tQueue =
381             clCreateCommandQueue(gContext, gDevice, 0, &error);
382         if (NULL == test_info.tinfo[i].tQueue || error)
383         {
384             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
385             goto exit;
386         }
387 
388         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
389     }
390 
391     // Init the kernels
392     {
393         BuildKernelInfo build_info = {
394             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
395             test_info.programs,  f->nameInCode,         relaxedMode
396         };
397         if ((error = ThreadPool_Do(BuildKernelFn,
398                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
399                                    &build_info)))
400             goto exit;
401     }
402 
403     // Run the kernels
404     if (!gSkipCorrectnessTesting)
405     {
406         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
407 
408         // Accumulate the arithmetic errors
409         for (cl_uint i = 0; i < test_info.threadCount; i++)
410         {
411             if (test_info.tinfo[i].maxError > maxError)
412             {
413                 maxError = test_info.tinfo[i].maxError;
414                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
415                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
416             }
417         }
418 
419         if (error) goto exit;
420 
421         if (gWimpyMode)
422             vlog("Wimp pass");
423         else
424             vlog("passed");
425 
426         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
427     }
428 
429     vlog("\n");
430 
431 exit:
432     // Release
433     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
434     {
435         clReleaseProgram(test_info.programs[i]);
436         if (test_info.k[i])
437         {
438             for (cl_uint j = 0; j < test_info.threadCount; j++)
439                 clReleaseKernel(test_info.k[i][j]);
440 
441             free(test_info.k[i]);
442         }
443     }
444     if (test_info.tinfo)
445     {
446         for (cl_uint i = 0; i < test_info.threadCount; i++)
447         {
448             free_mtdata(test_info.tinfo[i].d);
449             clReleaseMemObject(test_info.tinfo[i].inBuf);
450             clReleaseMemObject(test_info.tinfo[i].inBuf2);
451             for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
452                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
453             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
454         }
455 
456         free(test_info.tinfo);
457     }
458 
459     return error;
460 }
461 
Test(cl_uint job_id,cl_uint thread_id,void * data)462 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
463 {
464     const TestInfo *job = (const TestInfo *)data;
465     size_t buffer_elements = job->subBufferSize;
466     size_t buffer_size = buffer_elements * sizeof(cl_double);
467     cl_uint base = job_id * (cl_uint)job->step;
468     ThreadInfo *tinfo = job->tinfo + thread_id;
469     float ulps = job->ulps;
470     dptr func = job->f->dfunc;
471     int ftz = job->ftz;
472     MTdata d = tinfo->d;
473     cl_int error;
474     const char *name = job->f->name;
475     cl_ulong *t;
476     cl_double *r;
477     cl_double *s;
478     cl_double *s2;
479 
480     Force64BitFPUPrecision();
481 
482     // start the map of the output arrays
483     cl_event e[VECTOR_SIZE_COUNT];
484     cl_ulong *out[VECTOR_SIZE_COUNT];
485     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
486     {
487         out[j] = (cl_ulong *)clEnqueueMapBuffer(
488             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
489             buffer_size, 0, NULL, e + j, &error);
490         if (error || NULL == out[j])
491         {
492             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
493                        error);
494             return error;
495         }
496     }
497 
498     // Get that moving
499     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
500 
501     // Init input array
502     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
503     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
504     cl_uint idx = 0;
505     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
506     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
507 
508     if (job_id <= (cl_uint)lastSpecialJobIndex)
509     { // test edge cases
510         cl_double *fp = (cl_double *)p;
511         cl_double *fp2 = (cl_double *)p2;
512         uint32_t x, y;
513 
514         x = (job_id * buffer_elements) % specialValuesCount;
515         y = (job_id * buffer_elements) / specialValuesCount;
516 
517         for (; idx < buffer_elements; idx++)
518         {
519             fp[idx] = specialValues[x];
520             fp2[idx] = specialValues[y];
521             if (++x >= specialValuesCount)
522             {
523                 x = 0;
524                 y++;
525                 if (y >= specialValuesCount) break;
526             }
527         }
528     }
529 
530     // Init any remaining values.
531     for (; idx < buffer_elements; idx++)
532     {
533         p[idx] = genrand_int64(d);
534         p2[idx] = genrand_int64(d);
535     }
536 
537     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
538                                       buffer_size, p, 0, NULL, NULL)))
539     {
540         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
541         goto exit;
542     }
543 
544     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
545                                       buffer_size, p2, 0, NULL, NULL)))
546     {
547         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
548         goto exit;
549     }
550 
551     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
552     {
553         // Wait for the map to finish
554         if ((error = clWaitForEvents(1, e + j)))
555         {
556             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
557             goto exit;
558         }
559         if ((error = clReleaseEvent(e[j])))
560         {
561             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
562             goto exit;
563         }
564 
565         // Fill the result buffer with garbage, so that old results don't carry
566         // over
567         uint32_t pattern = 0xffffdead;
568         memset_pattern4(out[j], &pattern, buffer_size);
569         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
570                                              out[j], 0, NULL, NULL)))
571         {
572             vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
573             goto exit;
574         }
575 
576         // run the kernel
577         size_t vectorCount =
578             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
579         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
580                                                  // own copy of the cl_kernel
581         cl_program program = job->programs[j];
582 
583         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
584                                     &tinfo->outBuf[j])))
585         {
586             LogBuildError(program);
587             return error;
588         }
589         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
590                                     &tinfo->inBuf)))
591         {
592             LogBuildError(program);
593             return error;
594         }
595         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
596                                     &tinfo->inBuf2)))
597         {
598             LogBuildError(program);
599             return error;
600         }
601 
602         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
603                                             &vectorCount, NULL, 0, NULL, NULL)))
604         {
605             vlog_error("FAILED -- could not execute kernel\n");
606             goto exit;
607         }
608     }
609 
610     // Get that moving
611     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
612 
613     if (gSkipCorrectnessTesting) return CL_SUCCESS;
614 
615     // Calculate the correctly rounded reference result
616     r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
617     s = (cl_double *)gIn + thread_id * buffer_elements;
618     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
619     for (size_t j = 0; j < buffer_elements; j++)
620         r[j] = (cl_double)func.f_ff(s[j], s2[j]);
621 
622     // Read the data back -- no need to wait for the first N-1 buffers but wait
623     // for the last buffer. This is an in order queue.
624     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
625     {
626         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
627         out[j] = (cl_ulong *)clEnqueueMapBuffer(
628             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
629             buffer_size, 0, NULL, NULL, &error);
630         if (error || NULL == out[j])
631         {
632             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
633                        error);
634             goto exit;
635         }
636     }
637 
638     // Verify data
639     t = (cl_ulong *)r;
640     for (size_t j = 0; j < buffer_elements; j++)
641     {
642         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
643         {
644             cl_ulong *q = out[k];
645 
646             // If we aren't getting the correctly rounded result
647             if (t[j] != q[j])
648             {
649                 cl_double test = ((cl_double *)q)[j];
650                 long double correct = func.f_ff(s[j], s2[j]);
651                 float err = Bruteforce_Ulp_Error_Double(test, correct);
652                 int fail = !(fabsf(err) <= ulps);
653 
654                 if (fail && ftz)
655                 {
656                     // retry per section 6.5.3.2
657                     if (IsDoubleResultSubnormal(correct, ulps))
658                     {
659                         fail = fail && (test != 0.0f);
660                         if (!fail) err = 0.0f;
661                     }
662 
663 
664                     // retry per section 6.5.3.3
665                     if (IsDoubleSubnormal(s[j]))
666                     {
667                         long double correct2 = func.f_ff(0.0, s2[j]);
668                         long double correct3 = func.f_ff(-0.0, s2[j]);
669                         float err2 =
670                             Bruteforce_Ulp_Error_Double(test, correct2);
671                         float err3 =
672                             Bruteforce_Ulp_Error_Double(test, correct3);
673                         fail = fail
674                             && ((!(fabsf(err2) <= ulps))
675                                 && (!(fabsf(err3) <= ulps)));
676                         if (fabsf(err2) < fabsf(err)) err = err2;
677                         if (fabsf(err3) < fabsf(err)) err = err3;
678 
679                         // retry per section 6.5.3.4
680                         if (IsDoubleResultSubnormal(correct2, ulps)
681                             || IsDoubleResultSubnormal(correct3, ulps))
682                         {
683                             fail = fail && (test != 0.0f);
684                             if (!fail) err = 0.0f;
685                         }
686 
687                         // try with both args as zero
688                         if (IsDoubleSubnormal(s2[j]))
689                         {
690                             correct2 = func.f_ff(0.0, 0.0);
691                             correct3 = func.f_ff(-0.0, 0.0);
692                             long double correct4 = func.f_ff(0.0, -0.0);
693                             long double correct5 = func.f_ff(-0.0, -0.0);
694                             err2 = Bruteforce_Ulp_Error_Double(test, correct2);
695                             err3 = Bruteforce_Ulp_Error_Double(test, correct3);
696                             float err4 =
697                                 Bruteforce_Ulp_Error_Double(test, correct4);
698                             float err5 =
699                                 Bruteforce_Ulp_Error_Double(test, correct5);
700                             fail = fail
701                                 && ((!(fabsf(err2) <= ulps))
702                                     && (!(fabsf(err3) <= ulps))
703                                     && (!(fabsf(err4) <= ulps))
704                                     && (!(fabsf(err5) <= ulps)));
705                             if (fabsf(err2) < fabsf(err)) err = err2;
706                             if (fabsf(err3) < fabsf(err)) err = err3;
707                             if (fabsf(err4) < fabsf(err)) err = err4;
708                             if (fabsf(err5) < fabsf(err)) err = err5;
709 
710                             // retry per section 6.5.3.4
711                             if (IsDoubleResultSubnormal(correct2, ulps)
712                                 || IsDoubleResultSubnormal(correct3, ulps)
713                                 || IsDoubleResultSubnormal(correct4, ulps)
714                                 || IsDoubleResultSubnormal(correct5, ulps))
715                             {
716                                 fail = fail && (test != 0.0f);
717                                 if (!fail) err = 0.0f;
718                             }
719                         }
720                     }
721                     else if (IsDoubleSubnormal(s2[j]))
722                     {
723                         long double correct2 = func.f_ff(s[j], 0.0);
724                         long double correct3 = func.f_ff(s[j], -0.0);
725                         float err2 =
726                             Bruteforce_Ulp_Error_Double(test, correct2);
727                         float err3 =
728                             Bruteforce_Ulp_Error_Double(test, correct3);
729                         fail = fail
730                             && ((!(fabsf(err2) <= ulps))
731                                 && (!(fabsf(err3) <= ulps)));
732                         if (fabsf(err2) < fabsf(err)) err = err2;
733                         if (fabsf(err3) < fabsf(err)) err = err3;
734 
735                         // retry per section 6.5.3.4
736                         if (IsDoubleResultSubnormal(correct2, ulps)
737                             || IsDoubleResultSubnormal(correct3, ulps))
738                         {
739                             fail = fail && (test != 0.0f);
740                             if (!fail) err = 0.0f;
741                         }
742                     }
743                 }
744 
745                 if (fabsf(err) > tinfo->maxError)
746                 {
747                     tinfo->maxError = fabsf(err);
748                     tinfo->maxErrorValue = s[j];
749                     tinfo->maxErrorValue2 = s2[j];
750                 }
751                 if (fail)
752                 {
753                     vlog_error(
754                         "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
755                         name, sizeNames[k], err, s[j], s2[j], r[j], test);
756                     error = -1;
757                     goto exit;
758                 }
759             }
760         }
761     }
762 
763     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
764     {
765         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
766                                              out[j], 0, NULL, NULL)))
767         {
768             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
769                        j, error);
770             return error;
771         }
772     }
773 
774     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
775 
776 
777     if (0 == (base & 0x0fffffff))
778     {
779         if (gVerboseBruteForce)
780         {
781             vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
782                  "ThreadCount:%2u\n",
783                  base, job->step, job->scale, buffer_elements, job->ulps,
784                  job->threadCount);
785         }
786         else
787         {
788             vlog(".");
789         }
790         fflush(stdout);
791     }
792 
793 exit:
794     return error;
795 }
796