1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20
21 #include <cstring>
22
23 #define CORRECTLY_ROUNDED 0
24 #define FLUSHED 1
25
BuildKernel(const char * name,int vectorSize,cl_kernel * k,cl_program * p,bool relaxedMode)26 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
27 cl_program *p, bool relaxedMode)
28 {
29 const char *c[] = { "__kernel void math_kernel",
30 sizeNames[vectorSize],
31 "( __global float",
32 sizeNames[vectorSize],
33 "* out, __global float",
34 sizeNames[vectorSize],
35 "* in1, __global float",
36 sizeNames[vectorSize],
37 "* in2, __global float",
38 sizeNames[vectorSize],
39 "* in3 )\n"
40 "{\n"
41 " size_t i = get_global_id(0);\n"
42 " out[i] = ",
43 name,
44 "( in1[i], in2[i], in3[i] );\n"
45 "}\n" };
46
47 const char *c3[] = {
48 "__kernel void math_kernel",
49 sizeNames[vectorSize],
50 "( __global float* out, __global float* in, __global float* in2, "
51 "__global float* in3)\n"
52 "{\n"
53 " size_t i = get_global_id(0);\n"
54 " if( i + 1 < get_global_size(0) )\n"
55 " {\n"
56 " float3 f0 = vload3( 0, in + 3 * i );\n"
57 " float3 f1 = vload3( 0, in2 + 3 * i );\n"
58 " float3 f2 = vload3( 0, in3 + 3 * i );\n"
59 " f0 = ",
60 name,
61 "( f0, f1, f2 );\n"
62 " vstore3( f0, 0, out + 3*i );\n"
63 " }\n"
64 " else\n"
65 " {\n"
66 " size_t parity = i & 1; // Figure out how many elements are "
67 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
68 "buffer size \n"
69 " float3 f0;\n"
70 " float3 f1;\n"
71 " float3 f2;\n"
72 " switch( parity )\n"
73 " {\n"
74 " case 1:\n"
75 " f0 = (float3)( in[3*i], NAN, NAN ); \n"
76 " f1 = (float3)( in2[3*i], NAN, NAN ); \n"
77 " f2 = (float3)( in3[3*i], NAN, NAN ); \n"
78 " break;\n"
79 " case 0:\n"
80 " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
81 " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
82 " f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
83 " break;\n"
84 " }\n"
85 " f0 = ",
86 name,
87 "( f0, f1, f2 );\n"
88 " switch( parity )\n"
89 " {\n"
90 " case 0:\n"
91 " out[3*i+1] = f0.y; \n"
92 " // fall through\n"
93 " case 1:\n"
94 " out[3*i] = f0.x; \n"
95 " break;\n"
96 " }\n"
97 " }\n"
98 "}\n"
99 };
100
101 const char **kern = c;
102 size_t kernSize = sizeof(c) / sizeof(c[0]);
103
104 if (sizeValues[vectorSize] == 3)
105 {
106 kern = c3;
107 kernSize = sizeof(c3) / sizeof(c3[0]);
108 }
109
110 char testName[32];
111 snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
112 sizeNames[vectorSize]);
113
114 return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
115 }
116
// Work description handed to each ThreadPool job that builds one kernel.
typedef struct BuildKernelInfo
{
    cl_uint offset; // the first vector size to build
    cl_kernel *kernels; // array indexed by vector size; job's slot is filled in
    cl_program *programs; // array indexed by vector size; job's slot is filled in
    const char *nameInCode; // name of the function under test in OpenCL C
    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
} BuildKernelInfo;
125
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)126 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
127 {
128 BuildKernelInfo *info = (BuildKernelInfo *)p;
129 cl_uint i = info->offset + job_id;
130 return BuildKernel(info->nameInCode, i, info->kernels + i,
131 info->programs + i, info->relaxedMode);
132 }
133
134 // A table of more difficult cases to get right
// A table of more difficult cases to get right
//
// Negative values first, ordered from -NaN / -infinity down through the
// large-magnitude normals, small integers and near-integer values, the
// subnormal range, to -0.0f; then the same set mirrored positive.  Values
// one ULP either side of powers of two (e.g. 0x1.fffffep63f vs.
// 0x1.000002p64f) probe rounding boundaries, and the subnormal entries
// exercise the flush-to-zero retry paths in the verifier.
static const float specialValues[] = {
    -NAN,
    -INFINITY,
    -FLT_MAX,
    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
    -3.0f,
    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
    -2.5f,
    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
    -2.0f,
    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
    -1.75f,
    -1.5f,
    -1.25f,
    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
    MAKE_HEX_FLOAT(-0x1.003p0f, -0x1003000L, -24),
    -MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
    -1.0f,
    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
    -FLT_MIN,
    // negative subnormals down to the smallest magnitude
    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
    -0.0f,

    // positive mirror of the list above
    +NAN,
    +INFINITY,
    +FLT_MAX,
    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
    +3.0f,
    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
    2.5f,
    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
    +2.0f,
    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
    1.75f,
    1.5f,
    1.25f,
    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
    MAKE_HEX_FLOAT(0x1.003p0f, 0x1003000L, -24),
    +MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
    +1.0f,
    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
    +FLT_MIN,
    // positive subnormals down to the smallest magnitude
    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
    +0.0f,
};

// Number of entries in specialValues.
static const size_t specialValuesCount =
    sizeof(specialValues) / sizeof(specialValues[0]);
215
TestFunc_Float_Float_Float_Float(const Func * f,MTdata d,bool relaxedMode)216 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
217 {
218 int error;
219
220 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
221
222 cl_program programs[VECTOR_SIZE_COUNT];
223 cl_kernel kernels[VECTOR_SIZE_COUNT];
224 float maxError = 0.0f;
225 int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
226 float maxErrorVal = 0.0f;
227 float maxErrorVal2 = 0.0f;
228 float maxErrorVal3 = 0.0f;
229 uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
230
231 cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
232
233 float float_ulps;
234 if (gIsEmbedded)
235 float_ulps = f->float_embedded_ulps;
236 else
237 float_ulps = f->float_ulps;
238
239 int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport;
240
241 // Init the kernels
242 {
243 BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
244 f->nameInCode, relaxedMode };
245 if ((error = ThreadPool_Do(BuildKernelFn,
246 gMaxVectorSizeIndex - gMinVectorSizeIndex,
247 &build_info)))
248 return error;
249 }
250
251 for (uint64_t i = 0; i < (1ULL << 32); i += step)
252 {
253 // Init input array
254 cl_uint *p = (cl_uint *)gIn;
255 cl_uint *p2 = (cl_uint *)gIn2;
256 cl_uint *p3 = (cl_uint *)gIn3;
257 size_t idx = 0;
258
259 if (i == 0)
260 { // test edge cases
261 float *fp = (float *)gIn;
262 float *fp2 = (float *)gIn2;
263 float *fp3 = (float *)gIn3;
264 uint32_t x, y, z;
265 x = y = z = 0;
266 for (; idx < BUFFER_SIZE / sizeof(float); idx++)
267 {
268 fp[idx] = specialValues[x];
269 fp2[idx] = specialValues[y];
270 fp3[idx] = specialValues[z];
271
272 if (++x >= specialValuesCount)
273 {
274 x = 0;
275 if (++y >= specialValuesCount)
276 {
277 y = 0;
278 if (++z >= specialValuesCount) break;
279 }
280 }
281 }
282 if (idx == BUFFER_SIZE / sizeof(float))
283 vlog_error("Test Error: not all special cases tested!\n");
284 }
285
286 for (; idx < BUFFER_SIZE / sizeof(float); idx++)
287 {
288 p[idx] = genrand_int32(d);
289 p2[idx] = genrand_int32(d);
290 p3[idx] = genrand_int32(d);
291 }
292
293 if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
294 BUFFER_SIZE, gIn, 0, NULL, NULL)))
295 {
296 vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
297 return error;
298 }
299
300 if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
301 BUFFER_SIZE, gIn2, 0, NULL, NULL)))
302 {
303 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
304 return error;
305 }
306
307 if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
308 BUFFER_SIZE, gIn3, 0, NULL, NULL)))
309 {
310 vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
311 return error;
312 }
313
314 // write garbage into output arrays
315 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
316 {
317 uint32_t pattern = 0xffffdead;
318 memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
319 if ((error =
320 clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
321 BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
322 {
323 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
324 error, j);
325 goto exit;
326 }
327 }
328
329 // Run the kernels
330 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
331 {
332 size_t vectorSize = sizeof(cl_float) * sizeValues[j];
333 size_t localCount = (BUFFER_SIZE + vectorSize - 1)
334 / vectorSize; // BUFFER_SIZE / vectorSize rounded up
335 if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
336 &gOutBuffer[j])))
337 {
338 LogBuildError(programs[j]);
339 goto exit;
340 }
341 if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
342 &gInBuffer)))
343 {
344 LogBuildError(programs[j]);
345 goto exit;
346 }
347 if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
348 &gInBuffer2)))
349 {
350 LogBuildError(programs[j]);
351 goto exit;
352 }
353 if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
354 &gInBuffer3)))
355 {
356 LogBuildError(programs[j]);
357 goto exit;
358 }
359
360 if ((error =
361 clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
362 &localCount, NULL, 0, NULL, NULL)))
363 {
364 vlog_error("FAILED -- could not execute kernel\n");
365 goto exit;
366 }
367 }
368
369 // Get that moving
370 if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
371
372 // Calculate the correctly rounded reference result
373 float *r = (float *)gOut_Ref;
374 float *s = (float *)gIn;
375 float *s2 = (float *)gIn2;
376 float *s3 = (float *)gIn3;
377 if (skipNanInf)
378 {
379 for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
380 {
381 feclearexcept(FE_OVERFLOW);
382 r[j] =
383 (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
384 overflow[j] =
385 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
386 }
387 }
388 else
389 {
390 for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
391 r[j] =
392 (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
393 }
394
395 // Read the data back
396 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
397 {
398 if ((error =
399 clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
400 BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
401 {
402 vlog_error("ReadArray failed %d\n", error);
403 goto exit;
404 }
405 }
406
407 if (gSkipCorrectnessTesting) break;
408
409 // Verify data
410 uint32_t *t = (uint32_t *)gOut_Ref;
411 for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
412 {
413 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
414 {
415 uint32_t *q = (uint32_t *)(gOut[k]);
416
417 // If we aren't getting the correctly rounded result
418 if (t[j] != q[j])
419 {
420 float err;
421 int fail;
422 float test = ((float *)q)[j];
423 float correct =
424 f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
425
426 // Per section 10 paragraph 6, accept any result if an input
427 // or output is a infinity or NaN or overflow
428 if (skipNanInf)
429 {
430 if (overflow[j] || IsFloatInfinity(correct)
431 || IsFloatNaN(correct) || IsFloatInfinity(s[j])
432 || IsFloatNaN(s[j]) || IsFloatInfinity(s2[j])
433 || IsFloatNaN(s2[j]) || IsFloatInfinity(s3[j])
434 || IsFloatNaN(s3[j]))
435 continue;
436 }
437
438
439 err = Ulp_Error(test, correct);
440 fail = !(fabsf(err) <= float_ulps);
441
442 if (fail && ftz)
443 {
444 float correct2, err2;
445
446 // retry per section 6.5.3.2 with flushing on
447 if (0.0f == test
448 && 0.0f
449 == f->func.f_fma(s[j], s2[j], s3[j], FLUSHED))
450 {
451 fail = 0;
452 err = 0.0f;
453 }
454
455 // retry per section 6.5.3.3
456 if (fail && IsFloatSubnormal(s[j]))
457 { // look at me,
458 float err3, correct3;
459
460 if (skipNanInf) feclearexcept(FE_OVERFLOW);
461
462 correct2 = f->func.f_fma(0.0f, s2[j], s3[j],
463 CORRECTLY_ROUNDED);
464 correct3 = f->func.f_fma(-0.0f, s2[j], s3[j],
465 CORRECTLY_ROUNDED);
466
467 if (skipNanInf)
468 {
469 if (fetestexcept(FE_OVERFLOW)) continue;
470
471 // Note: no double rounding here. Reference
472 // functions calculate in single precision.
473 if (IsFloatInfinity(correct2)
474 || IsFloatNaN(correct2)
475 || IsFloatInfinity(correct3)
476 || IsFloatNaN(correct3))
477 continue;
478 }
479
480 err2 = Ulp_Error(test, correct2);
481 err3 = Ulp_Error(test, correct3);
482 fail = fail
483 && ((!(fabsf(err2) <= float_ulps))
484 && (!(fabsf(err3) <= float_ulps)));
485 if (fabsf(err2) < fabsf(err)) err = err2;
486 if (fabsf(err3) < fabsf(err)) err = err3;
487
488 // retry per section 6.5.3.4
489 if (0.0f == test
490 && (0.0f
491 == f->func.f_fma(0.0f, s2[j], s3[j],
492 FLUSHED)
493 || 0.0f
494 == f->func.f_fma(-0.0f, s2[j], s3[j],
495 FLUSHED)))
496 {
497 fail = 0;
498 err = 0.0f;
499 }
500
501 // try with first two args as zero
502 if (IsFloatSubnormal(s2[j]))
503 { // its fun to have fun,
504 double correct4, correct5;
505 float err4, err5;
506
507 if (skipNanInf) feclearexcept(FE_OVERFLOW);
508
509 correct2 = f->func.f_fma(0.0f, 0.0f, s3[j],
510 CORRECTLY_ROUNDED);
511 correct3 = f->func.f_fma(-0.0f, 0.0f, s3[j],
512 CORRECTLY_ROUNDED);
513 correct4 = f->func.f_fma(0.0f, -0.0f, s3[j],
514 CORRECTLY_ROUNDED);
515 correct5 = f->func.f_fma(-0.0f, -0.0f, s3[j],
516 CORRECTLY_ROUNDED);
517
518 // Per section 10 paragraph 6, accept any result
519 // if an input or output is a infinity or NaN or
520 // overflow
521 if (!gInfNanSupport)
522 {
523 if (fetestexcept(FE_OVERFLOW)) continue;
524
525 // Note: no double rounding here. Reference
526 // functions calculate in single precision.
527 if (IsFloatInfinity(correct2)
528 || IsFloatNaN(correct2)
529 || IsFloatInfinity(correct3)
530 || IsFloatNaN(correct3)
531 || IsFloatInfinity(correct4)
532 || IsFloatNaN(correct4)
533 || IsFloatInfinity(correct5)
534 || IsFloatNaN(correct5))
535 continue;
536 }
537
538 err2 = Ulp_Error(test, correct2);
539 err3 = Ulp_Error(test, correct3);
540 err4 = Ulp_Error(test, correct4);
541 err5 = Ulp_Error(test, correct5);
542 fail = fail
543 && ((!(fabsf(err2) <= float_ulps))
544 && (!(fabsf(err3) <= float_ulps))
545 && (!(fabsf(err4) <= float_ulps))
546 && (!(fabsf(err5) <= float_ulps)));
547 if (fabsf(err2) < fabsf(err)) err = err2;
548 if (fabsf(err3) < fabsf(err)) err = err3;
549 if (fabsf(err4) < fabsf(err)) err = err4;
550 if (fabsf(err5) < fabsf(err)) err = err5;
551
552 // retry per section 6.5.3.4
553 if (0.0f == test
554 && (0.0f
555 == f->func.f_fma(0.0f, 0.0f, s3[j],
556 FLUSHED)
557 || 0.0f
558 == f->func.f_fma(-0.0f, 0.0f, s3[j],
559 FLUSHED)
560 || 0.0f
561 == f->func.f_fma(0.0f, -0.0f, s3[j],
562 FLUSHED)
563 || 0.0f
564 == f->func.f_fma(-0.0f, -0.0f,
565 s3[j], FLUSHED)))
566 {
567 fail = 0;
568 err = 0.0f;
569 }
570
571 if (IsFloatSubnormal(s3[j]))
572 {
573 if (test == 0.0f) // 0*0+0 is 0
574 {
575 fail = 0;
576 err = 0.0f;
577 }
578 }
579 }
580 else if (IsFloatSubnormal(s3[j]))
581 {
582 double correct4, correct5;
583 float err4, err5;
584
585 if (skipNanInf) feclearexcept(FE_OVERFLOW);
586
587 correct2 = f->func.f_fma(0.0f, s2[j], 0.0f,
588 CORRECTLY_ROUNDED);
589 correct3 = f->func.f_fma(-0.0f, s2[j], 0.0f,
590 CORRECTLY_ROUNDED);
591 correct4 = f->func.f_fma(0.0f, s2[j], -0.0f,
592 CORRECTLY_ROUNDED);
593 correct5 = f->func.f_fma(-0.0f, s2[j], -0.0f,
594 CORRECTLY_ROUNDED);
595
596 // Per section 10 paragraph 6, accept any result
597 // if an input or output is a infinity or NaN or
598 // overflow
599 if (!gInfNanSupport)
600 {
601 if (fetestexcept(FE_OVERFLOW)) continue;
602
603 // Note: no double rounding here. Reference
604 // functions calculate in single precision.
605 if (IsFloatInfinity(correct2)
606 || IsFloatNaN(correct2)
607 || IsFloatInfinity(correct3)
608 || IsFloatNaN(correct3)
609 || IsFloatInfinity(correct4)
610 || IsFloatNaN(correct4)
611 || IsFloatInfinity(correct5)
612 || IsFloatNaN(correct5))
613 continue;
614 }
615
616 err2 = Ulp_Error(test, correct2);
617 err3 = Ulp_Error(test, correct3);
618 err4 = Ulp_Error(test, correct4);
619 err5 = Ulp_Error(test, correct5);
620 fail = fail
621 && ((!(fabsf(err2) <= float_ulps))
622 && (!(fabsf(err3) <= float_ulps))
623 && (!(fabsf(err4) <= float_ulps))
624 && (!(fabsf(err5) <= float_ulps)));
625 if (fabsf(err2) < fabsf(err)) err = err2;
626 if (fabsf(err3) < fabsf(err)) err = err3;
627 if (fabsf(err4) < fabsf(err)) err = err4;
628 if (fabsf(err5) < fabsf(err)) err = err5;
629
630 // retry per section 6.5.3.4
631 if (0.0f == test
632 && (0.0f
633 == f->func.f_fma(0.0f, s2[j], 0.0f,
634 FLUSHED)
635 || 0.0f
636 == f->func.f_fma(-0.0f, s2[j], 0.0f,
637 FLUSHED)
638 || 0.0f
639 == f->func.f_fma(0.0f, s2[j], -0.0f,
640 FLUSHED)
641 || 0.0f
642 == f->func.f_fma(-0.0f, s2[j],
643 -0.0f, FLUSHED)))
644 {
645 fail = 0;
646 err = 0.0f;
647 }
648 }
649 }
650 else if (fail && IsFloatSubnormal(s2[j]))
651 {
652 double correct2, correct3;
653 float err2, err3;
654
655 if (skipNanInf) feclearexcept(FE_OVERFLOW);
656
657 correct2 = f->func.f_fma(s[j], 0.0f, s3[j],
658 CORRECTLY_ROUNDED);
659 correct3 = f->func.f_fma(s[j], -0.0f, s3[j],
660 CORRECTLY_ROUNDED);
661
662 if (skipNanInf)
663 {
664 if (fetestexcept(FE_OVERFLOW)) continue;
665
666 // Note: no double rounding here. Reference
667 // functions calculate in single precision.
668 if (IsFloatInfinity(correct2)
669 || IsFloatNaN(correct2)
670 || IsFloatInfinity(correct3)
671 || IsFloatNaN(correct3))
672 continue;
673 }
674
675 err2 = Ulp_Error(test, correct2);
676 err3 = Ulp_Error(test, correct3);
677 fail = fail
678 && ((!(fabsf(err2) <= float_ulps))
679 && (!(fabsf(err3) <= float_ulps)));
680 if (fabsf(err2) < fabsf(err)) err = err2;
681 if (fabsf(err3) < fabsf(err)) err = err3;
682
683 // retry per section 6.5.3.4
684 if (0.0f == test
685 && (0.0f
686 == f->func.f_fma(s[j], 0.0f, s3[j],
687 FLUSHED)
688 || 0.0f
689 == f->func.f_fma(s[j], -0.0f, s3[j],
690 FLUSHED)))
691 {
692 fail = 0;
693 err = 0.0f;
694 }
695
696 // try with second two args as zero
697 if (IsFloatSubnormal(s3[j]))
698 {
699 double correct4, correct5;
700 float err4, err5;
701
702 if (skipNanInf) feclearexcept(FE_OVERFLOW);
703
704 correct2 = f->func.f_fma(s[j], 0.0f, 0.0f,
705 CORRECTLY_ROUNDED);
706 correct3 = f->func.f_fma(s[j], -0.0f, 0.0f,
707 CORRECTLY_ROUNDED);
708 correct4 = f->func.f_fma(s[j], 0.0f, -0.0f,
709 CORRECTLY_ROUNDED);
710 correct5 = f->func.f_fma(s[j], -0.0f, -0.0f,
711 CORRECTLY_ROUNDED);
712
713 // Per section 10 paragraph 6, accept any result
714 // if an input or output is a infinity or NaN or
715 // overflow
716 if (!gInfNanSupport)
717 {
718 if (fetestexcept(FE_OVERFLOW)) continue;
719
720 // Note: no double rounding here. Reference
721 // functions calculate in single precision.
722 if (IsFloatInfinity(correct2)
723 || IsFloatNaN(correct2)
724 || IsFloatInfinity(correct3)
725 || IsFloatNaN(correct3)
726 || IsFloatInfinity(correct4)
727 || IsFloatNaN(correct4)
728 || IsFloatInfinity(correct5)
729 || IsFloatNaN(correct5))
730 continue;
731 }
732
733 err2 = Ulp_Error(test, correct2);
734 err3 = Ulp_Error(test, correct3);
735 err4 = Ulp_Error(test, correct4);
736 err5 = Ulp_Error(test, correct5);
737 fail = fail
738 && ((!(fabsf(err2) <= float_ulps))
739 && (!(fabsf(err3) <= float_ulps))
740 && (!(fabsf(err4) <= float_ulps))
741 && (!(fabsf(err5) <= float_ulps)));
742 if (fabsf(err2) < fabsf(err)) err = err2;
743 if (fabsf(err3) < fabsf(err)) err = err3;
744 if (fabsf(err4) < fabsf(err)) err = err4;
745 if (fabsf(err5) < fabsf(err)) err = err5;
746
747 // retry per section 6.5.3.4
748 if (0.0f == test
749 && (0.0f
750 == f->func.f_fma(s[j], 0.0f, 0.0f,
751 FLUSHED)
752 || 0.0f
753 == f->func.f_fma(s[j], -0.0f, 0.0f,
754 FLUSHED)
755 || 0.0f
756 == f->func.f_fma(s[j], 0.0f, -0.0f,
757 FLUSHED)
758 || 0.0f
759 == f->func.f_fma(s[j], -0.0f, -0.0f,
760 FLUSHED)))
761 {
762 fail = 0;
763 err = 0.0f;
764 }
765 }
766 }
767 else if (fail && IsFloatSubnormal(s3[j]))
768 {
769 double correct2, correct3;
770 float err2, err3;
771
772 if (skipNanInf) feclearexcept(FE_OVERFLOW);
773
774 correct2 = f->func.f_fma(s[j], s2[j], 0.0f,
775 CORRECTLY_ROUNDED);
776 correct3 = f->func.f_fma(s[j], s2[j], -0.0f,
777 CORRECTLY_ROUNDED);
778
779 if (skipNanInf)
780 {
781 if (fetestexcept(FE_OVERFLOW)) continue;
782
783 // Note: no double rounding here. Reference
784 // functions calculate in single precision.
785 if (IsFloatInfinity(correct2)
786 || IsFloatNaN(correct2)
787 || IsFloatInfinity(correct3)
788 || IsFloatNaN(correct3))
789 continue;
790 }
791
792 err2 = Ulp_Error(test, correct2);
793 err3 = Ulp_Error(test, correct3);
794 fail = fail
795 && ((!(fabsf(err2) <= float_ulps))
796 && (!(fabsf(err3) <= float_ulps)));
797 if (fabsf(err2) < fabsf(err)) err = err2;
798 if (fabsf(err3) < fabsf(err)) err = err3;
799
800 // retry per section 6.5.3.4
801 if (0.0f == test
802 && (0.0f
803 == f->func.f_fma(s[j], s2[j], 0.0f,
804 FLUSHED)
805 || 0.0f
806 == f->func.f_fma(s[j], s2[j], -0.0f,
807 FLUSHED)))
808 {
809 fail = 0;
810 err = 0.0f;
811 }
812 }
813 }
814
815 if (fabsf(err) > maxError)
816 {
817 maxError = fabsf(err);
818 maxErrorVal = s[j];
819 maxErrorVal2 = s2[j];
820 maxErrorVal3 = s3[j];
821 }
822
823 if (fail)
824 {
825 vlog_error(
826 "\nERROR: %s%s: %f ulp error at {%a, %a, %a} "
827 "({0x%8.8x, 0x%8.8x, 0x%8.8x}): *%a vs. %a\n",
828 f->name, sizeNames[k], err, s[j], s2[j], s3[j],
829 ((cl_uint *)s)[j], ((cl_uint *)s2)[j],
830 ((cl_uint *)s3)[j], ((float *)gOut_Ref)[j], test);
831 error = -1;
832 goto exit;
833 }
834 }
835 }
836 }
837
838 if (0 == (i & 0x0fffffff))
839 {
840 if (gVerboseBruteForce)
841 {
842 vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step,
843 BUFFER_SIZE);
844 }
845 else
846 {
847 vlog(".");
848 }
849 fflush(stdout);
850 }
851 }
852
853 if (!gSkipCorrectnessTesting)
854 {
855 if (gWimpyMode)
856 vlog("Wimp pass");
857 else
858 vlog("passed");
859
860 vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
861 maxErrorVal3);
862 }
863
864 vlog("\n");
865
866 exit:
867 // Release
868 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
869 {
870 clReleaseKernel(kernels[k]);
871 clReleaseProgram(programs[k]);
872 }
873
874 return error;
875 }
876