1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <cstring>
22 
23 #define CORRECTLY_ROUNDED 0
24 #define FLUSHED 1
25 
BuildKernel(const char * name,int vectorSize,cl_kernel * k,cl_program * p,bool relaxedMode)26 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
27                        cl_program *p, bool relaxedMode)
28 {
29     const char *c[] = { "__kernel void math_kernel",
30                         sizeNames[vectorSize],
31                         "( __global float",
32                         sizeNames[vectorSize],
33                         "* out, __global float",
34                         sizeNames[vectorSize],
35                         "* in1, __global float",
36                         sizeNames[vectorSize],
37                         "* in2,  __global float",
38                         sizeNames[vectorSize],
39                         "* in3 )\n"
40                         "{\n"
41                         "   size_t i = get_global_id(0);\n"
42                         "   out[i] = ",
43                         name,
44                         "( in1[i], in2[i], in3[i] );\n"
45                         "}\n" };
46 
47     const char *c3[] = {
48         "__kernel void math_kernel",
49         sizeNames[vectorSize],
50         "( __global float* out, __global float* in, __global float* in2, "
51         "__global float* in3)\n"
52         "{\n"
53         "   size_t i = get_global_id(0);\n"
54         "   if( i + 1 < get_global_size(0) )\n"
55         "   {\n"
56         "       float3 f0 = vload3( 0, in + 3 * i );\n"
57         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
58         "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
59         "       f0 = ",
60         name,
61         "( f0, f1, f2 );\n"
62         "       vstore3( f0, 0, out + 3*i );\n"
63         "   }\n"
64         "   else\n"
65         "   {\n"
66         "       size_t parity = i & 1;   // Figure out how many elements are "
67         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
68         "buffer size \n"
69         "       float3 f0;\n"
70         "       float3 f1;\n"
71         "       float3 f2;\n"
72         "       switch( parity )\n"
73         "       {\n"
74         "           case 1:\n"
75         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
76         "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
77         "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
78         "               break;\n"
79         "           case 0:\n"
80         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
81         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
82         "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
83         "               break;\n"
84         "       }\n"
85         "       f0 = ",
86         name,
87         "( f0, f1, f2 );\n"
88         "       switch( parity )\n"
89         "       {\n"
90         "           case 0:\n"
91         "               out[3*i+1] = f0.y; \n"
92         "               // fall through\n"
93         "           case 1:\n"
94         "               out[3*i] = f0.x; \n"
95         "               break;\n"
96         "       }\n"
97         "   }\n"
98         "}\n"
99     };
100 
101     const char **kern = c;
102     size_t kernSize = sizeof(c) / sizeof(c[0]);
103 
104     if (sizeValues[vectorSize] == 3)
105     {
106         kern = c3;
107         kernSize = sizeof(c3) / sizeof(c3[0]);
108     }
109 
110     char testName[32];
111     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
112              sizeNames[vectorSize]);
113 
114     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
115 }
116 
117 typedef struct BuildKernelInfo
118 {
119     cl_uint offset; // the first vector size to build
120     cl_kernel *kernels;
121     cl_program *programs;
122     const char *nameInCode;
123     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
124 } BuildKernelInfo;
125 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)126 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
127 {
128     BuildKernelInfo *info = (BuildKernelInfo *)p;
129     cl_uint i = info->offset + job_id;
130     return BuildKernel(info->nameInCode, i, info->kernels + i,
131                        info->programs + i, info->relaxedMode);
132 }
133 
134 // A table of more difficult cases to get right
135 static const float specialValues[] = {
136     -NAN,
137     -INFINITY,
138     -FLT_MAX,
139     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
140     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
141     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
142     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
143     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
144     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
145     -3.0f,
146     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
147     -2.5f,
148     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
149     -2.0f,
150     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
151     -1.75f,
152     -1.5f,
153     -1.25f,
154     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
155     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
156     MAKE_HEX_FLOAT(-0x1.003p0f, -0x1003000L, -24),
157     -MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
158     -1.0f,
159     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
160     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
161     -FLT_MIN,
162     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
163     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
164     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
165     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
166     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
167     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
168     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
169     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
170     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
171     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
172     -0.0f,
173 
174     +NAN,
175     +INFINITY,
176     +FLT_MAX,
177     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
178     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
179     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
180     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
181     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
182     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
183     +3.0f,
184     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
185     2.5f,
186     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
187     +2.0f,
188     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
189     1.75f,
190     1.5f,
191     1.25f,
192     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
193     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
194     MAKE_HEX_FLOAT(0x1.003p0f, 0x1003000L, -24),
195     +MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
196     +1.0f,
197     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
198     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
199     +FLT_MIN,
200     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
201     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
202     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
203     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
204     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
205     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
206     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
207     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
208     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
209     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
210     +0.0f,
211 };
212 
213 static const size_t specialValuesCount =
214     sizeof(specialValues) / sizeof(specialValues[0]);
215 
TestFunc_Float_Float_Float_Float(const Func * f,MTdata d,bool relaxedMode)216 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
217 {
218     int error;
219 
220     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
221 
222     cl_program programs[VECTOR_SIZE_COUNT];
223     cl_kernel kernels[VECTOR_SIZE_COUNT];
224     float maxError = 0.0f;
225     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
226     float maxErrorVal = 0.0f;
227     float maxErrorVal2 = 0.0f;
228     float maxErrorVal3 = 0.0f;
229     uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
230 
231     cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
232 
233     float float_ulps;
234     if (gIsEmbedded)
235         float_ulps = f->float_embedded_ulps;
236     else
237         float_ulps = f->float_ulps;
238 
239     int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport;
240 
241     // Init the kernels
242     {
243         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
244                                        f->nameInCode, relaxedMode };
245         if ((error = ThreadPool_Do(BuildKernelFn,
246                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
247                                    &build_info)))
248             return error;
249     }
250 
251     for (uint64_t i = 0; i < (1ULL << 32); i += step)
252     {
253         // Init input array
254         cl_uint *p = (cl_uint *)gIn;
255         cl_uint *p2 = (cl_uint *)gIn2;
256         cl_uint *p3 = (cl_uint *)gIn3;
257         size_t idx = 0;
258 
259         if (i == 0)
260         { // test edge cases
261             float *fp = (float *)gIn;
262             float *fp2 = (float *)gIn2;
263             float *fp3 = (float *)gIn3;
264             uint32_t x, y, z;
265             x = y = z = 0;
266             for (; idx < BUFFER_SIZE / sizeof(float); idx++)
267             {
268                 fp[idx] = specialValues[x];
269                 fp2[idx] = specialValues[y];
270                 fp3[idx] = specialValues[z];
271 
272                 if (++x >= specialValuesCount)
273                 {
274                     x = 0;
275                     if (++y >= specialValuesCount)
276                     {
277                         y = 0;
278                         if (++z >= specialValuesCount) break;
279                     }
280                 }
281             }
282             if (idx == BUFFER_SIZE / sizeof(float))
283                 vlog_error("Test Error: not all special cases tested!\n");
284         }
285 
286         for (; idx < BUFFER_SIZE / sizeof(float); idx++)
287         {
288             p[idx] = genrand_int32(d);
289             p2[idx] = genrand_int32(d);
290             p3[idx] = genrand_int32(d);
291         }
292 
293         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
294                                           BUFFER_SIZE, gIn, 0, NULL, NULL)))
295         {
296             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
297             return error;
298         }
299 
300         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
301                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
302         {
303             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
304             return error;
305         }
306 
307         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
308                                           BUFFER_SIZE, gIn3, 0, NULL, NULL)))
309         {
310             vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
311             return error;
312         }
313 
314         // write garbage into output arrays
315         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
316         {
317             uint32_t pattern = 0xffffdead;
318             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
319             if ((error =
320                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
321                                           BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
322             {
323                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
324                            error, j);
325                 goto exit;
326             }
327         }
328 
329         // Run the kernels
330         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
331         {
332             size_t vectorSize = sizeof(cl_float) * sizeValues[j];
333             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
334                 / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
335             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
336                                         &gOutBuffer[j])))
337             {
338                 LogBuildError(programs[j]);
339                 goto exit;
340             }
341             if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
342                                         &gInBuffer)))
343             {
344                 LogBuildError(programs[j]);
345                 goto exit;
346             }
347             if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
348                                         &gInBuffer2)))
349             {
350                 LogBuildError(programs[j]);
351                 goto exit;
352             }
353             if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
354                                         &gInBuffer3)))
355             {
356                 LogBuildError(programs[j]);
357                 goto exit;
358             }
359 
360             if ((error =
361                      clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
362                                             &localCount, NULL, 0, NULL, NULL)))
363             {
364                 vlog_error("FAILED -- could not execute kernel\n");
365                 goto exit;
366             }
367         }
368 
369         // Get that moving
370         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
371 
372         // Calculate the correctly rounded reference result
373         float *r = (float *)gOut_Ref;
374         float *s = (float *)gIn;
375         float *s2 = (float *)gIn2;
376         float *s3 = (float *)gIn3;
377         if (skipNanInf)
378         {
379             for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
380             {
381                 feclearexcept(FE_OVERFLOW);
382                 r[j] =
383                     (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
384                 overflow[j] =
385                     FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
386             }
387         }
388         else
389         {
390             for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
391                 r[j] =
392                     (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
393         }
394 
395         // Read the data back
396         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
397         {
398             if ((error =
399                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
400                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
401             {
402                 vlog_error("ReadArray failed %d\n", error);
403                 goto exit;
404             }
405         }
406 
407         if (gSkipCorrectnessTesting) break;
408 
409         // Verify data
410         uint32_t *t = (uint32_t *)gOut_Ref;
411         for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
412         {
413             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
414             {
415                 uint32_t *q = (uint32_t *)(gOut[k]);
416 
417                 // If we aren't getting the correctly rounded result
418                 if (t[j] != q[j])
419                 {
420                     float err;
421                     int fail;
422                     float test = ((float *)q)[j];
423                     float correct =
424                         f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
425 
426                     // Per section 10 paragraph 6, accept any result if an input
427                     // or output is a infinity or NaN or overflow
428                     if (skipNanInf)
429                     {
430                         if (overflow[j] || IsFloatInfinity(correct)
431                             || IsFloatNaN(correct) || IsFloatInfinity(s[j])
432                             || IsFloatNaN(s[j]) || IsFloatInfinity(s2[j])
433                             || IsFloatNaN(s2[j]) || IsFloatInfinity(s3[j])
434                             || IsFloatNaN(s3[j]))
435                             continue;
436                     }
437 
438 
439                     err = Ulp_Error(test, correct);
440                     fail = !(fabsf(err) <= float_ulps);
441 
442                     if (fail && ftz)
443                     {
444                         float correct2, err2;
445 
446                         // retry per section 6.5.3.2  with flushing on
447                         if (0.0f == test
448                             && 0.0f
449                                 == f->func.f_fma(s[j], s2[j], s3[j], FLUSHED))
450                         {
451                             fail = 0;
452                             err = 0.0f;
453                         }
454 
455                         // retry per section 6.5.3.3
456                         if (fail && IsFloatSubnormal(s[j]))
457                         { // look at me,
458                             float err3, correct3;
459 
460                             if (skipNanInf) feclearexcept(FE_OVERFLOW);
461 
462                             correct2 = f->func.f_fma(0.0f, s2[j], s3[j],
463                                                      CORRECTLY_ROUNDED);
464                             correct3 = f->func.f_fma(-0.0f, s2[j], s3[j],
465                                                      CORRECTLY_ROUNDED);
466 
467                             if (skipNanInf)
468                             {
469                                 if (fetestexcept(FE_OVERFLOW)) continue;
470 
471                                 // Note: no double rounding here.  Reference
472                                 // functions calculate in single precision.
473                                 if (IsFloatInfinity(correct2)
474                                     || IsFloatNaN(correct2)
475                                     || IsFloatInfinity(correct3)
476                                     || IsFloatNaN(correct3))
477                                     continue;
478                             }
479 
480                             err2 = Ulp_Error(test, correct2);
481                             err3 = Ulp_Error(test, correct3);
482                             fail = fail
483                                 && ((!(fabsf(err2) <= float_ulps))
484                                     && (!(fabsf(err3) <= float_ulps)));
485                             if (fabsf(err2) < fabsf(err)) err = err2;
486                             if (fabsf(err3) < fabsf(err)) err = err3;
487 
488                             // retry per section 6.5.3.4
489                             if (0.0f == test
490                                 && (0.0f
491                                         == f->func.f_fma(0.0f, s2[j], s3[j],
492                                                          FLUSHED)
493                                     || 0.0f
494                                         == f->func.f_fma(-0.0f, s2[j], s3[j],
495                                                          FLUSHED)))
496                             {
497                                 fail = 0;
498                                 err = 0.0f;
499                             }
500 
501                             // try with first two args as zero
502                             if (IsFloatSubnormal(s2[j]))
503                             { // its fun to have fun,
504                                 double correct4, correct5;
505                                 float err4, err5;
506 
507                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
508 
509                                 correct2 = f->func.f_fma(0.0f, 0.0f, s3[j],
510                                                          CORRECTLY_ROUNDED);
511                                 correct3 = f->func.f_fma(-0.0f, 0.0f, s3[j],
512                                                          CORRECTLY_ROUNDED);
513                                 correct4 = f->func.f_fma(0.0f, -0.0f, s3[j],
514                                                          CORRECTLY_ROUNDED);
515                                 correct5 = f->func.f_fma(-0.0f, -0.0f, s3[j],
516                                                          CORRECTLY_ROUNDED);
517 
518                                 // Per section 10 paragraph 6, accept any result
519                                 // if an input or output is a infinity or NaN or
520                                 // overflow
521                                 if (!gInfNanSupport)
522                                 {
523                                     if (fetestexcept(FE_OVERFLOW)) continue;
524 
525                                     // Note: no double rounding here.  Reference
526                                     // functions calculate in single precision.
527                                     if (IsFloatInfinity(correct2)
528                                         || IsFloatNaN(correct2)
529                                         || IsFloatInfinity(correct3)
530                                         || IsFloatNaN(correct3)
531                                         || IsFloatInfinity(correct4)
532                                         || IsFloatNaN(correct4)
533                                         || IsFloatInfinity(correct5)
534                                         || IsFloatNaN(correct5))
535                                         continue;
536                                 }
537 
538                                 err2 = Ulp_Error(test, correct2);
539                                 err3 = Ulp_Error(test, correct3);
540                                 err4 = Ulp_Error(test, correct4);
541                                 err5 = Ulp_Error(test, correct5);
542                                 fail = fail
543                                     && ((!(fabsf(err2) <= float_ulps))
544                                         && (!(fabsf(err3) <= float_ulps))
545                                         && (!(fabsf(err4) <= float_ulps))
546                                         && (!(fabsf(err5) <= float_ulps)));
547                                 if (fabsf(err2) < fabsf(err)) err = err2;
548                                 if (fabsf(err3) < fabsf(err)) err = err3;
549                                 if (fabsf(err4) < fabsf(err)) err = err4;
550                                 if (fabsf(err5) < fabsf(err)) err = err5;
551 
552                                 // retry per section 6.5.3.4
553                                 if (0.0f == test
554                                     && (0.0f
555                                             == f->func.f_fma(0.0f, 0.0f, s3[j],
556                                                              FLUSHED)
557                                         || 0.0f
558                                             == f->func.f_fma(-0.0f, 0.0f, s3[j],
559                                                              FLUSHED)
560                                         || 0.0f
561                                             == f->func.f_fma(0.0f, -0.0f, s3[j],
562                                                              FLUSHED)
563                                         || 0.0f
564                                             == f->func.f_fma(-0.0f, -0.0f,
565                                                              s3[j], FLUSHED)))
566                                 {
567                                     fail = 0;
568                                     err = 0.0f;
569                                 }
570 
571                                 if (IsFloatSubnormal(s3[j]))
572                                 {
573                                     if (test == 0.0f) // 0*0+0 is 0
574                                     {
575                                         fail = 0;
576                                         err = 0.0f;
577                                     }
578                                 }
579                             }
580                             else if (IsFloatSubnormal(s3[j]))
581                             {
582                                 double correct4, correct5;
583                                 float err4, err5;
584 
585                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
586 
587                                 correct2 = f->func.f_fma(0.0f, s2[j], 0.0f,
588                                                          CORRECTLY_ROUNDED);
589                                 correct3 = f->func.f_fma(-0.0f, s2[j], 0.0f,
590                                                          CORRECTLY_ROUNDED);
591                                 correct4 = f->func.f_fma(0.0f, s2[j], -0.0f,
592                                                          CORRECTLY_ROUNDED);
593                                 correct5 = f->func.f_fma(-0.0f, s2[j], -0.0f,
594                                                          CORRECTLY_ROUNDED);
595 
596                                 // Per section 10 paragraph 6, accept any result
597                                 // if an input or output is a infinity or NaN or
598                                 // overflow
599                                 if (!gInfNanSupport)
600                                 {
601                                     if (fetestexcept(FE_OVERFLOW)) continue;
602 
603                                     // Note: no double rounding here.  Reference
604                                     // functions calculate in single precision.
605                                     if (IsFloatInfinity(correct2)
606                                         || IsFloatNaN(correct2)
607                                         || IsFloatInfinity(correct3)
608                                         || IsFloatNaN(correct3)
609                                         || IsFloatInfinity(correct4)
610                                         || IsFloatNaN(correct4)
611                                         || IsFloatInfinity(correct5)
612                                         || IsFloatNaN(correct5))
613                                         continue;
614                                 }
615 
616                                 err2 = Ulp_Error(test, correct2);
617                                 err3 = Ulp_Error(test, correct3);
618                                 err4 = Ulp_Error(test, correct4);
619                                 err5 = Ulp_Error(test, correct5);
620                                 fail = fail
621                                     && ((!(fabsf(err2) <= float_ulps))
622                                         && (!(fabsf(err3) <= float_ulps))
623                                         && (!(fabsf(err4) <= float_ulps))
624                                         && (!(fabsf(err5) <= float_ulps)));
625                                 if (fabsf(err2) < fabsf(err)) err = err2;
626                                 if (fabsf(err3) < fabsf(err)) err = err3;
627                                 if (fabsf(err4) < fabsf(err)) err = err4;
628                                 if (fabsf(err5) < fabsf(err)) err = err5;
629 
630                                 // retry per section 6.5.3.4
631                                 if (0.0f == test
632                                     && (0.0f
633                                             == f->func.f_fma(0.0f, s2[j], 0.0f,
634                                                              FLUSHED)
635                                         || 0.0f
636                                             == f->func.f_fma(-0.0f, s2[j], 0.0f,
637                                                              FLUSHED)
638                                         || 0.0f
639                                             == f->func.f_fma(0.0f, s2[j], -0.0f,
640                                                              FLUSHED)
641                                         || 0.0f
642                                             == f->func.f_fma(-0.0f, s2[j],
643                                                              -0.0f, FLUSHED)))
644                                 {
645                                     fail = 0;
646                                     err = 0.0f;
647                                 }
648                             }
649                         }
650                         else if (fail && IsFloatSubnormal(s2[j]))
651                         {
652                             double correct2, correct3;
653                             float err2, err3;
654 
655                             if (skipNanInf) feclearexcept(FE_OVERFLOW);
656 
657                             correct2 = f->func.f_fma(s[j], 0.0f, s3[j],
658                                                      CORRECTLY_ROUNDED);
659                             correct3 = f->func.f_fma(s[j], -0.0f, s3[j],
660                                                      CORRECTLY_ROUNDED);
661 
662                             if (skipNanInf)
663                             {
664                                 if (fetestexcept(FE_OVERFLOW)) continue;
665 
666                                 // Note: no double rounding here.  Reference
667                                 // functions calculate in single precision.
668                                 if (IsFloatInfinity(correct2)
669                                     || IsFloatNaN(correct2)
670                                     || IsFloatInfinity(correct3)
671                                     || IsFloatNaN(correct3))
672                                     continue;
673                             }
674 
675                             err2 = Ulp_Error(test, correct2);
676                             err3 = Ulp_Error(test, correct3);
677                             fail = fail
678                                 && ((!(fabsf(err2) <= float_ulps))
679                                     && (!(fabsf(err3) <= float_ulps)));
680                             if (fabsf(err2) < fabsf(err)) err = err2;
681                             if (fabsf(err3) < fabsf(err)) err = err3;
682 
683                             // retry per section 6.5.3.4
684                             if (0.0f == test
685                                 && (0.0f
686                                         == f->func.f_fma(s[j], 0.0f, s3[j],
687                                                          FLUSHED)
688                                     || 0.0f
689                                         == f->func.f_fma(s[j], -0.0f, s3[j],
690                                                          FLUSHED)))
691                             {
692                                 fail = 0;
693                                 err = 0.0f;
694                             }
695 
696                             // try with second two args as zero
697                             if (IsFloatSubnormal(s3[j]))
698                             {
699                                 double correct4, correct5;
700                                 float err4, err5;
701 
702                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
703 
704                                 correct2 = f->func.f_fma(s[j], 0.0f, 0.0f,
705                                                          CORRECTLY_ROUNDED);
706                                 correct3 = f->func.f_fma(s[j], -0.0f, 0.0f,
707                                                          CORRECTLY_ROUNDED);
708                                 correct4 = f->func.f_fma(s[j], 0.0f, -0.0f,
709                                                          CORRECTLY_ROUNDED);
710                                 correct5 = f->func.f_fma(s[j], -0.0f, -0.0f,
711                                                          CORRECTLY_ROUNDED);
712 
713                                 // Per section 10 paragraph 6, accept any result
714                                 // if an input or output is a infinity or NaN or
715                                 // overflow
716                                 if (!gInfNanSupport)
717                                 {
718                                     if (fetestexcept(FE_OVERFLOW)) continue;
719 
720                                     // Note: no double rounding here.  Reference
721                                     // functions calculate in single precision.
722                                     if (IsFloatInfinity(correct2)
723                                         || IsFloatNaN(correct2)
724                                         || IsFloatInfinity(correct3)
725                                         || IsFloatNaN(correct3)
726                                         || IsFloatInfinity(correct4)
727                                         || IsFloatNaN(correct4)
728                                         || IsFloatInfinity(correct5)
729                                         || IsFloatNaN(correct5))
730                                         continue;
731                                 }
732 
733                                 err2 = Ulp_Error(test, correct2);
734                                 err3 = Ulp_Error(test, correct3);
735                                 err4 = Ulp_Error(test, correct4);
736                                 err5 = Ulp_Error(test, correct5);
737                                 fail = fail
738                                     && ((!(fabsf(err2) <= float_ulps))
739                                         && (!(fabsf(err3) <= float_ulps))
740                                         && (!(fabsf(err4) <= float_ulps))
741                                         && (!(fabsf(err5) <= float_ulps)));
742                                 if (fabsf(err2) < fabsf(err)) err = err2;
743                                 if (fabsf(err3) < fabsf(err)) err = err3;
744                                 if (fabsf(err4) < fabsf(err)) err = err4;
745                                 if (fabsf(err5) < fabsf(err)) err = err5;
746 
747                                 // retry per section 6.5.3.4
748                                 if (0.0f == test
749                                     && (0.0f
750                                             == f->func.f_fma(s[j], 0.0f, 0.0f,
751                                                              FLUSHED)
752                                         || 0.0f
753                                             == f->func.f_fma(s[j], -0.0f, 0.0f,
754                                                              FLUSHED)
755                                         || 0.0f
756                                             == f->func.f_fma(s[j], 0.0f, -0.0f,
757                                                              FLUSHED)
758                                         || 0.0f
759                                             == f->func.f_fma(s[j], -0.0f, -0.0f,
760                                                              FLUSHED)))
761                                 {
762                                     fail = 0;
763                                     err = 0.0f;
764                                 }
765                             }
766                         }
767                         else if (fail && IsFloatSubnormal(s3[j]))
768                         {
769                             double correct2, correct3;
770                             float err2, err3;
771 
772                             if (skipNanInf) feclearexcept(FE_OVERFLOW);
773 
774                             correct2 = f->func.f_fma(s[j], s2[j], 0.0f,
775                                                      CORRECTLY_ROUNDED);
776                             correct3 = f->func.f_fma(s[j], s2[j], -0.0f,
777                                                      CORRECTLY_ROUNDED);
778 
779                             if (skipNanInf)
780                             {
781                                 if (fetestexcept(FE_OVERFLOW)) continue;
782 
783                                 // Note: no double rounding here.  Reference
784                                 // functions calculate in single precision.
785                                 if (IsFloatInfinity(correct2)
786                                     || IsFloatNaN(correct2)
787                                     || IsFloatInfinity(correct3)
788                                     || IsFloatNaN(correct3))
789                                     continue;
790                             }
791 
792                             err2 = Ulp_Error(test, correct2);
793                             err3 = Ulp_Error(test, correct3);
794                             fail = fail
795                                 && ((!(fabsf(err2) <= float_ulps))
796                                     && (!(fabsf(err3) <= float_ulps)));
797                             if (fabsf(err2) < fabsf(err)) err = err2;
798                             if (fabsf(err3) < fabsf(err)) err = err3;
799 
800                             // retry per section 6.5.3.4
801                             if (0.0f == test
802                                 && (0.0f
803                                         == f->func.f_fma(s[j], s2[j], 0.0f,
804                                                          FLUSHED)
805                                     || 0.0f
806                                         == f->func.f_fma(s[j], s2[j], -0.0f,
807                                                          FLUSHED)))
808                             {
809                                 fail = 0;
810                                 err = 0.0f;
811                             }
812                         }
813                     }
814 
815                     if (fabsf(err) > maxError)
816                     {
817                         maxError = fabsf(err);
818                         maxErrorVal = s[j];
819                         maxErrorVal2 = s2[j];
820                         maxErrorVal3 = s3[j];
821                     }
822 
823                     if (fail)
824                     {
825                         vlog_error(
826                             "\nERROR: %s%s: %f ulp error at {%a, %a, %a} "
827                             "({0x%8.8x, 0x%8.8x, 0x%8.8x}): *%a vs. %a\n",
828                             f->name, sizeNames[k], err, s[j], s2[j], s3[j],
829                             ((cl_uint *)s)[j], ((cl_uint *)s2)[j],
830                             ((cl_uint *)s3)[j], ((float *)gOut_Ref)[j], test);
831                         error = -1;
832                         goto exit;
833                     }
834                 }
835             }
836         }
837 
838         if (0 == (i & 0x0fffffff))
839         {
840             if (gVerboseBruteForce)
841             {
842                 vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step,
843                      BUFFER_SIZE);
844             }
845             else
846             {
847                 vlog(".");
848             }
849             fflush(stdout);
850         }
851     }
852 
853     if (!gSkipCorrectnessTesting)
854     {
855         if (gWimpyMode)
856             vlog("Wimp pass");
857         else
858             vlog("passed");
859 
860         vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
861              maxErrorVal3);
862     }
863 
864     vlog("\n");
865 
866 exit:
867     // Release
868     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
869     {
870         clReleaseKernel(kernels[k]);
871         clReleaseProgram(programs[k]);
872     }
873 
874     return error;
875 }
876