1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <climits>
22 #include <cstring>
23 
BuildKernel(const char * name,int vectorSize,cl_kernel * k,cl_program * p,bool relaxedMode)24 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
25                        cl_program *p, bool relaxedMode)
26 {
27     const char *c[] = { "__kernel void math_kernel",
28                         sizeNames[vectorSize],
29                         "( __global float",
30                         sizeNames[vectorSize],
31                         "* out, __global int",
32                         sizeNames[vectorSize],
33                         "* out2, __global float",
34                         sizeNames[vectorSize],
35                         "* in1, __global float",
36                         sizeNames[vectorSize],
37                         "* in2 )\n"
38                         "{\n"
39                         "   size_t i = get_global_id(0);\n"
40                         "   out[i] = ",
41                         name,
42                         "( in1[i], in2[i], out2 + i );\n"
43                         "}\n" };
44 
45     const char *c3[] = {
46         "__kernel void math_kernel",
47         sizeNames[vectorSize],
48         "( __global float* out, __global int* out2, __global float* in, "
49         "__global float* in2)\n"
50         "{\n"
51         "   size_t i = get_global_id(0);\n"
52         "   if( i + 1 < get_global_size(0) )\n"
53         "   {\n"
54         "       float3 f0 = vload3( 0, in + 3 * i );\n"
55         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
56         "       int3 i0 = 0xdeaddead;\n"
57         "       f0 = ",
58         name,
59         "( f0, f1, &i0 );\n"
60         "       vstore3( f0, 0, out + 3*i );\n"
61         "       vstore3( i0, 0, out2 + 3*i );\n"
62         "   }\n"
63         "   else\n"
64         "   {\n"
65         "       size_t parity = i & 1;   // Figure out how many elements are "
66         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
67         "buffer size \n"
68         "       float3 f0;\n"
69         "       float3 f1;\n"
70         "       int3 i0 = 0xdeaddead;\n"
71         "       switch( parity )\n"
72         "       {\n"
73         "           case 1:\n"
74         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
75         "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
76         "               break;\n"
77         "           case 0:\n"
78         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
79         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
80         "               break;\n"
81         "       }\n"
82         "       f0 = ",
83         name,
84         "( f0, f1, &i0 );\n"
85         "       switch( parity )\n"
86         "       {\n"
87         "           case 0:\n"
88         "               out[3*i+1] = f0.y; \n"
89         "               out2[3*i+1] = i0.y; \n"
90         "               // fall through\n"
91         "           case 1:\n"
92         "               out[3*i] = f0.x; \n"
93         "               out2[3*i] = i0.x; \n"
94         "               break;\n"
95         "       }\n"
96         "   }\n"
97         "}\n"
98     };
99 
100     const char **kern = c;
101     size_t kernSize = sizeof(c) / sizeof(c[0]);
102 
103     if (sizeValues[vectorSize] == 3)
104     {
105         kern = c3;
106         kernSize = sizeof(c3) / sizeof(c3[0]);
107     }
108 
109     char testName[32];
110     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
111              sizeNames[vectorSize]);
112 
113     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
114 }
115 
116 typedef struct BuildKernelInfo
117 {
118     cl_uint offset; // the first vector size to build
119     cl_kernel *kernels;
120     cl_program *programs;
121     const char *nameInCode;
122     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
123 } BuildKernelInfo;
124 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)125 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
126 {
127     BuildKernelInfo *info = (BuildKernelInfo *)p;
128     cl_uint i = info->offset + job_id;
129     return BuildKernel(info->nameInCode, i, info->kernels + i,
130                        info->programs + i, info->relaxedMode);
131 }
132 
133 typedef struct ComputeReferenceInfoF_
134 {
135     const float *x;
136     const float *y;
137     float *r;
138     int *i;
139     double (*f_ffpI)(double, double, int *);
140     cl_uint lim;
141     cl_uint count;
142 } ComputeReferenceInfoF;
143 
ReferenceF(cl_uint jid,cl_uint tid,void * userInfo)144 static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
145 {
146     ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
147     cl_uint lim = cri->lim;
148     cl_uint count = cri->count;
149     cl_uint off = jid * count;
150     const float *x = cri->x + off;
151     const float *y = cri->y + off;
152     float *r = cri->r + off;
153     int *i = cri->i + off;
154     double (*f)(double, double, int *) = cri->f_ffpI;
155 
156     if (off + count > lim) count = lim - off;
157 
158     for (cl_uint j = 0; j < count; ++j)
159         r[j] = (float)f((double)x[j], (double)y[j], i + j);
160 
161     return CL_SUCCESS;
162 }
163 
TestFunc_FloatI_Float_Float(const Func * f,MTdata d,bool relaxedMode)164 int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
165 {
166     int error;
167 
168     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
169 
170     cl_program programs[VECTOR_SIZE_COUNT];
171     cl_kernel kernels[VECTOR_SIZE_COUNT];
172     float maxError = 0.0f;
173     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
174     int64_t maxError2 = 0;
175     float maxErrorVal = 0.0f;
176     float maxErrorVal2 = 0.0f;
177     uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
178 
179     cl_uint threadCount = GetThreadCount();
180 
181     float float_ulps;
182     if (gIsEmbedded)
183         float_ulps = f->float_embedded_ulps;
184     else
185         float_ulps = f->float_ulps;
186 
187     int testingRemquo = !strcmp(f->name, "remquo");
188 
189     // Init the kernels
190     {
191         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
192                                        f->nameInCode, relaxedMode };
193         if ((error = ThreadPool_Do(BuildKernelFn,
194                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
195                                    &build_info)))
196             return error;
197     }
198 
199     for (uint64_t i = 0; i < (1ULL << 32); i += step)
200     {
201         // Init input array
202         cl_uint *p = (cl_uint *)gIn;
203         cl_uint *p2 = (cl_uint *)gIn2;
204         for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
205         {
206             p[j] = genrand_int32(d);
207             p2[j] = genrand_int32(d);
208         }
209 
210         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
211                                           BUFFER_SIZE, gIn, 0, NULL, NULL)))
212         {
213             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
214             return error;
215         }
216 
217         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
218                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
219         {
220             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
221             return error;
222         }
223 
224         // write garbage into output arrays
225         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
226         {
227             uint32_t pattern = 0xffffdead;
228             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
229             if ((error =
230                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
231                                           BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
232             {
233                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
234                            error, j);
235                 goto exit;
236             }
237 
238             memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
239             if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE,
240                                               0, BUFFER_SIZE, gOut2[j], 0, NULL,
241                                               NULL)))
242             {
243                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
244                            error, j);
245                 goto exit;
246             }
247         }
248 
249         // Run the kernels
250         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
251         {
252             size_t vectorSize = sizeof(cl_float) * sizeValues[j];
253             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
254                 / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
255             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
256                                         &gOutBuffer[j])))
257             {
258                 LogBuildError(programs[j]);
259                 goto exit;
260             }
261             if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
262                                         &gOutBuffer2[j])))
263             {
264                 LogBuildError(programs[j]);
265                 goto exit;
266             }
267             if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
268                                         &gInBuffer)))
269             {
270                 LogBuildError(programs[j]);
271                 goto exit;
272             }
273             if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
274                                         &gInBuffer2)))
275             {
276                 LogBuildError(programs[j]);
277                 goto exit;
278             }
279 
280             if ((error =
281                      clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
282                                             &localCount, NULL, 0, NULL, NULL)))
283             {
284                 vlog_error("FAILED -- could not execute kernel\n");
285                 goto exit;
286             }
287         }
288 
289         // Get that moving
290         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
291 
292         // Calculate the correctly rounded reference result
293         float *s = (float *)gIn;
294         float *s2 = (float *)gIn2;
295 
296         if (threadCount > 1)
297         {
298             ComputeReferenceInfoF cri;
299             cri.x = s;
300             cri.y = s2;
301             cri.r = (float *)gOut_Ref;
302             cri.i = (int *)gOut_Ref2;
303             cri.f_ffpI = f->func.f_ffpI;
304             cri.lim = BUFFER_SIZE / sizeof(float);
305             cri.count = (cri.lim + threadCount - 1) / threadCount;
306             ThreadPool_Do(ReferenceF, threadCount, &cri);
307         }
308         else
309         {
310             float *r = (float *)gOut_Ref;
311             int *r2 = (int *)gOut_Ref2;
312             for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
313                 r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
314         }
315 
316         // Read the data back
317         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
318         {
319             if ((error =
320                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
321                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
322             {
323                 vlog_error("ReadArray failed %d\n", error);
324                 goto exit;
325             }
326             if ((error =
327                      clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
328                                          BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
329             {
330                 vlog_error("ReadArray2 failed %d\n", error);
331                 goto exit;
332             }
333         }
334 
335         if (gSkipCorrectnessTesting) break;
336 
337         // Verify data
338         uint32_t *t = (uint32_t *)gOut_Ref;
339         int32_t *t2 = (int32_t *)gOut_Ref2;
340         for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
341         {
342             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
343             {
344                 uint32_t *q = (uint32_t *)(gOut[k]);
345                 int32_t *q2 = (int32_t *)gOut2[k];
346 
347                 // Check for exact match to correctly rounded result
348                 if (t[j] == q[j] && t2[j] == q2[j]) continue;
349 
350                 // Check for paired NaNs
351                 if ((t[j] & 0x7fffffff) > 0x7f800000
352                     && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
353                     continue;
354 
355                 float test = ((float *)q)[j];
356                 int correct2 = INT_MIN;
357                 double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
358                 float err = Ulp_Error(test, correct);
359                 int64_t iErr;
360 
361                 // in case of remquo, we only care about the sign and last
362                 // seven bits of integer as per the spec.
363                 if (testingRemquo)
364                     iErr = (long long)(q2[j] & 0x0000007f)
365                         - (long long)(correct2 & 0x0000007f);
366                 else
367                     iErr = (long long)q2[j] - (long long)correct2;
368 
369                 // For remquo, if y = 0, x is infinite, or either is NaN
370                 // then the standard either neglects to say what is returned
371                 // in iptr or leaves it undefined or implementation defined.
372                 int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
373                     || ((float *)gIn2)[j] == 0.0f || isnan(((float *)gIn2)[j])
374                     || isnan(((float *)gIn)[j]);
375                 if (iptrUndefined) iErr = 0;
376 
377                 int fail = !(fabsf(err) <= float_ulps && iErr == 0);
378                 if (ftz && fail)
379                 {
380                     // retry per section 6.5.3.2
381                     if (IsFloatResultSubnormal(correct, float_ulps))
382                     {
383                         fail = fail && !(test == 0.0f && iErr == 0);
384                         if (!fail) err = 0.0f;
385                     }
386 
387                     // retry per section 6.5.3.3
388                     if (IsFloatSubnormal(s[j]))
389                     {
390                         int correct3i, correct4i;
391                         double correct3 =
392                             f->func.f_ffpI(0.0, s2[j], &correct3i);
393                         double correct4 =
394                             f->func.f_ffpI(-0.0, s2[j], &correct4i);
395                         float err2 = Ulp_Error(test, correct3);
396                         float err3 = Ulp_Error(test, correct4);
397                         int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
398                         int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
399                         fail = fail
400                             && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
401                                 && (!(fabsf(err3) <= float_ulps
402                                       && iErr4 == 0)));
403                         if (fabsf(err2) < fabsf(err)) err = err2;
404                         if (fabsf(err3) < fabsf(err)) err = err3;
405                         if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
406                         if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
407 
408                         // retry per section 6.5.3.4
409                         if (IsFloatResultSubnormal(correct2, float_ulps)
410                             || IsFloatResultSubnormal(correct3, float_ulps))
411                         {
412                             fail = fail
413                                 && !(test == 0.0f
414                                      && (iErr3 == 0 || iErr4 == 0));
415                             if (!fail) err = 0.0f;
416                         }
417 
418                         // try with both args as zero
419                         if (IsFloatSubnormal(s2[j]))
420                         {
421                             int correct7i, correct8i;
422                             correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
423                             correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
424                             double correct7 =
425                                 f->func.f_ffpI(0.0, -0.0, &correct7i);
426                             double correct8 =
427                                 f->func.f_ffpI(-0.0, -0.0, &correct8i);
428                             err2 = Ulp_Error(test, correct3);
429                             err3 = Ulp_Error(test, correct4);
430                             float err4 = Ulp_Error(test, correct7);
431                             float err5 = Ulp_Error(test, correct8);
432                             iErr3 = (long long)q2[j] - (long long)correct3i;
433                             iErr4 = (long long)q2[j] - (long long)correct4i;
434                             int64_t iErr7 =
435                                 (long long)q2[j] - (long long)correct7i;
436                             int64_t iErr8 =
437                                 (long long)q2[j] - (long long)correct8i;
438                             fail = fail
439                                 && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
440                                     && (!(fabsf(err3) <= float_ulps
441                                           && iErr4 == 0))
442                                     && (!(fabsf(err4) <= float_ulps
443                                           && iErr7 == 0))
444                                     && (!(fabsf(err5) <= float_ulps
445                                           && iErr8 == 0)));
446                             if (fabsf(err2) < fabsf(err)) err = err2;
447                             if (fabsf(err3) < fabsf(err)) err = err3;
448                             if (fabsf(err4) < fabsf(err)) err = err4;
449                             if (fabsf(err5) < fabsf(err)) err = err5;
450                             if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
451                             if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
452                             if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
453                             if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
454 
455                             // retry per section 6.5.3.4
456                             if (IsFloatResultSubnormal(correct3, float_ulps)
457                                 || IsFloatResultSubnormal(correct4, float_ulps)
458                                 || IsFloatResultSubnormal(correct7, float_ulps)
459                                 || IsFloatResultSubnormal(correct8, float_ulps))
460                             {
461                                 fail = fail
462                                     && !(test == 0.0f
463                                          && (iErr3 == 0 || iErr4 == 0
464                                              || iErr7 == 0 || iErr8 == 0));
465                                 if (!fail) err = 0.0f;
466                             }
467                         }
468                     }
469                     else if (IsFloatSubnormal(s2[j]))
470                     {
471                         int correct3i, correct4i;
472                         double correct3 = f->func.f_ffpI(s[j], 0.0, &correct3i);
473                         double correct4 =
474                             f->func.f_ffpI(s[j], -0.0, &correct4i);
475                         float err2 = Ulp_Error(test, correct3);
476                         float err3 = Ulp_Error(test, correct4);
477                         int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
478                         int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
479                         fail = fail
480                             && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
481                                 && (!(fabsf(err3) <= float_ulps
482                                       && iErr4 == 0)));
483                         if (fabsf(err2) < fabsf(err)) err = err2;
484                         if (fabsf(err3) < fabsf(err)) err = err3;
485                         if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
486                         if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
487 
488                         // retry per section 6.5.3.4
489                         if (IsFloatResultSubnormal(correct2, float_ulps)
490                             || IsFloatResultSubnormal(correct3, float_ulps))
491                         {
492                             fail = fail
493                                 && !(test == 0.0f
494                                      && (iErr3 == 0 || iErr4 == 0));
495                             if (!fail) err = 0.0f;
496                         }
497                     }
498                 }
499                 if (fabsf(err) > maxError)
500                 {
501                     maxError = fabsf(err);
502                     maxErrorVal = s[j];
503                 }
504                 if (llabs(iErr) > maxError2)
505                 {
506                     maxError2 = llabs(iErr);
507                     maxErrorVal2 = s[j];
508                 }
509 
510                 if (fail)
511                 {
512                     vlog_error(
513                         "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} "
514                         "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
515                         "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
516                         f->name, sizeNames[k], err, iErr, ((float *)gIn)[j],
517                         ((float *)gIn2)[j], ((cl_uint *)gIn)[j],
518                         ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j],
519                         ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j],
520                         ((cl_uint *)gOut_Ref2)[j], test, q2[j],
521                         ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
522                     error = -1;
523                     goto exit;
524                 }
525             }
526         }
527 
528         if (0 == (i & 0x0fffffff))
529         {
530             if (gVerboseBruteForce)
531             {
532                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
533                      BUFFER_SIZE);
534             }
535             else
536             {
537                 vlog(".");
538             }
539             fflush(stdout);
540         }
541     }
542 
543     if (!gSkipCorrectnessTesting)
544     {
545         if (gWimpyMode)
546             vlog("Wimp pass");
547         else
548             vlog("passed");
549 
550         vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
551              maxErrorVal2);
552     }
553 
554     vlog("\n");
555 
556 exit:
557     // Release
558     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
559     {
560         clReleaseKernel(kernels[k]);
561         clReleaseProgram(programs[k]);
562     }
563 
564     return error;
565 }
566