1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <climits>
22 #include <cstring>
23 
BuildKernel(const char * name,int vectorSize,cl_kernel * k,cl_program * p,bool relaxedMode)24 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
25                        cl_program *p, bool relaxedMode)
26 {
27     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
28                         "__kernel void math_kernel",
29                         sizeNames[vectorSize],
30                         "( __global double",
31                         sizeNames[vectorSize],
32                         "* out, __global int",
33                         sizeNames[vectorSize],
34                         "* out2, __global double",
35                         sizeNames[vectorSize],
36                         "* in )\n"
37                         "{\n"
38                         "   size_t i = get_global_id(0);\n"
39                         "   out[i] = ",
40                         name,
41                         "( in[i], out2 + i );\n"
42                         "}\n" };
43 
44     const char *c3[] = {
45         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
46         "__kernel void math_kernel",
47         sizeNames[vectorSize],
48         "( __global double* out, __global int* out2, __global double* in)\n"
49         "{\n"
50         "   size_t i = get_global_id(0);\n"
51         "   if( i + 1 < get_global_size(0) )\n"
52         "   {\n"
53         "       double3 f0 = vload3( 0, in + 3 * i );\n"
54         "       int3 iout = INT_MIN;\n"
55         "       f0 = ",
56         name,
57         "( f0, &iout );\n"
58         "       vstore3( f0, 0, out + 3*i );\n"
59         "       vstore3( iout, 0, out2 + 3*i );\n"
60         "   }\n"
61         "   else\n"
62         "   {\n"
63         "       size_t parity = i & 1;   // Figure out how many elements are "
64         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
65         "buffer size \n"
66         "       int3 iout = INT_MIN;\n"
67         "       double3 f0;\n"
68         "       switch( parity )\n"
69         "       {\n"
70         "           case 1:\n"
71         "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
72         "               break;\n"
73         "           case 0:\n"
74         "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
75         "               break;\n"
76         "       }\n"
77         "       f0 = ",
78         name,
79         "( f0, &iout );\n"
80         "       switch( parity )\n"
81         "       {\n"
82         "           case 0:\n"
83         "               out[3*i+1] = f0.y; \n"
84         "               out2[3*i+1] = iout.y; \n"
85         "               // fall through\n"
86         "           case 1:\n"
87         "               out[3*i] = f0.x; \n"
88         "               out2[3*i] = iout.x; \n"
89         "               break;\n"
90         "       }\n"
91         "   }\n"
92         "}\n"
93     };
94 
95     const char **kern = c;
96     size_t kernSize = sizeof(c) / sizeof(c[0]);
97 
98     if (sizeValues[vectorSize] == 3)
99     {
100         kern = c3;
101         kernSize = sizeof(c3) / sizeof(c3[0]);
102     }
103 
104     char testName[32];
105     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
106              sizeNames[vectorSize]);
107 
108     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
109 }
110 
111 typedef struct BuildKernelInfo
112 {
113     cl_uint offset; // the first vector size to build
114     cl_kernel *kernels;
115     cl_program *programs;
116     const char *nameInCode;
117     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
118 } BuildKernelInfo;
119 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)120 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
121 {
122     BuildKernelInfo *info = (BuildKernelInfo *)p;
123     cl_uint i = info->offset + job_id;
124     return BuildKernel(info->nameInCode, i, info->kernels + i,
125                        info->programs + i, info->relaxedMode);
126 }
127 
abs_cl_long(cl_long i)128 static cl_ulong abs_cl_long(cl_long i)
129 {
130     cl_long mask = i >> 63;
131     return (i ^ mask) - mask;
132 }
133 
TestFunc_DoubleI_Double(const Func * f,MTdata d,bool relaxedMode)134 int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
135 {
136     int error;
137     cl_program programs[VECTOR_SIZE_COUNT];
138     cl_kernel kernels[VECTOR_SIZE_COUNT];
139     float maxError = 0.0f;
140     int64_t maxError2 = 0;
141     int ftz = f->ftz || gForceFTZ;
142     double maxErrorVal = 0.0f;
143     double maxErrorVal2 = 0.0f;
144     cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
145     uint64_t step = getTestStep(sizeof(cl_double), BUFFER_SIZE);
146     int scale =
147         (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(cl_double)) + 1);
148 
149     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
150 
151     Force64BitFPUPrecision();
152 
153     // Init the kernels
154     {
155         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
156                                        f->nameInCode, relaxedMode };
157         if ((error = ThreadPool_Do(BuildKernelFn,
158                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
159                                    &build_info)))
160             return error;
161     }
162 
163     for (uint64_t i = 0; i < (1ULL << 32); i += step)
164     {
165         // Init input array
166         double *p = (double *)gIn;
167         if (gWimpyMode)
168         {
169             for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
170                 p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
171         }
172         else
173         {
174             for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
175                 p[j] = DoubleFromUInt32((uint32_t)i + j);
176         }
177         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
178                                           BUFFER_SIZE, gIn, 0, NULL, NULL)))
179         {
180             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
181             return error;
182         }
183 
184         // write garbage into output arrays
185         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
186         {
187             uint32_t pattern = 0xffffdead;
188             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
189             if ((error =
190                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
191                                           BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
192             {
193                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
194                            error, j);
195                 goto exit;
196             }
197 
198             memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
199             if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE,
200                                               0, BUFFER_SIZE, gOut2[j], 0, NULL,
201                                               NULL)))
202             {
203                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
204                            error, j);
205                 goto exit;
206             }
207         }
208 
209         // Run the kernels
210         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
211         {
212             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
213             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
214             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
215                                         &gOutBuffer[j])))
216             {
217                 LogBuildError(programs[j]);
218                 goto exit;
219             }
220             if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
221                                         &gOutBuffer2[j])))
222             {
223                 LogBuildError(programs[j]);
224                 goto exit;
225             }
226             if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
227                                         &gInBuffer)))
228             {
229                 LogBuildError(programs[j]);
230                 goto exit;
231             }
232 
233             if ((error =
234                      clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
235                                             &localCount, NULL, 0, NULL, NULL)))
236             {
237                 vlog_error("FAILED -- could not execute kernel\n");
238                 goto exit;
239             }
240         }
241 
242         // Get that moving
243         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
244 
245         // Calculate the correctly rounded reference result
246         double *r = (double *)gOut_Ref;
247         int *r2 = (int *)gOut_Ref2;
248         double *s = (double *)gIn;
249         for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
250             r[j] = (double)f->dfunc.f_fpI(s[j], r2 + j);
251 
252         // Read the data back
253         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
254         {
255             if ((error =
256                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
257                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
258             {
259                 vlog_error("ReadArray failed %d\n", error);
260                 goto exit;
261             }
262             if ((error =
263                      clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
264                                          BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
265             {
266                 vlog_error("ReadArray2 failed %d\n", error);
267                 goto exit;
268             }
269         }
270 
271         if (gSkipCorrectnessTesting) break;
272 
273         // Verify data
274         uint64_t *t = (uint64_t *)gOut_Ref;
275         int32_t *t2 = (int32_t *)gOut_Ref2;
276         for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
277         {
278             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
279             {
280                 uint64_t *q = (uint64_t *)(gOut[k]);
281                 int32_t *q2 = (int32_t *)(gOut2[k]);
282 
283                 // If we aren't getting the correctly rounded result
284                 if (t[j] != q[j] || t2[j] != q2[j])
285                 {
286                     double test = ((double *)q)[j];
287                     int correct2 = INT_MIN;
288                     long double correct = f->dfunc.f_fpI(s[j], &correct2);
289                     float err = Bruteforce_Ulp_Error_Double(test, correct);
290                     cl_long iErr = (long long)q2[j] - (long long)correct2;
291                     int fail = !(fabsf(err) <= f->double_ulps
292                                  && abs_cl_long(iErr) <= maxiError);
293                     if (ftz)
294                     {
295                         // retry per section 6.5.3.2
296                         if (IsDoubleResultSubnormal(correct, f->double_ulps))
297                         {
298                             fail = fail && !(test == 0.0f && iErr == 0);
299                             if (!fail) err = 0.0f;
300                         }
301 
302                         // retry per section 6.5.3.3
303                         if (IsDoubleSubnormal(s[j]))
304                         {
305                             int correct5, correct6;
306                             long double correct3 =
307                                 f->dfunc.f_fpI(0.0, &correct5);
308                             long double correct4 =
309                                 f->dfunc.f_fpI(-0.0, &correct6);
310                             float err2 =
311                                 Bruteforce_Ulp_Error_Double(test, correct3);
312                             float err3 =
313                                 Bruteforce_Ulp_Error_Double(test, correct4);
314                             cl_long iErr2 =
315                                 (long long)q2[j] - (long long)correct5;
316                             cl_long iErr3 =
317                                 (long long)q2[j] - (long long)correct6;
318 
319                             // Did +0 work?
320                             if (fabsf(err2) <= f->double_ulps
321                                 && abs_cl_long(iErr2) <= maxiError)
322                             {
323                                 err = err2;
324                                 iErr = iErr2;
325                                 fail = 0;
326                             }
327                             // Did -0 work?
328                             else if (fabsf(err3) <= f->double_ulps
329                                      && abs_cl_long(iErr3) <= maxiError)
330                             {
331                                 err = err3;
332                                 iErr = iErr3;
333                                 fail = 0;
334                             }
335 
336                             // retry per section 6.5.3.4
337                             if (fail
338                                 && (IsDoubleResultSubnormal(correct2,
339                                                             f->double_ulps)
340                                     || IsDoubleResultSubnormal(correct3,
341                                                                f->double_ulps)))
342                             {
343                                 fail = fail
344                                     && !(test == 0.0f
345                                          && (abs_cl_long(iErr2) <= maxiError
346                                              || abs_cl_long(iErr3)
347                                                  <= maxiError));
348                                 if (!fail)
349                                 {
350                                     err = 0.0f;
351                                     iErr = 0;
352                                 }
353                             }
354                         }
355                     }
356                     if (fabsf(err) > maxError)
357                     {
358                         maxError = fabsf(err);
359                         maxErrorVal = s[j];
360                     }
361                     if (llabs(iErr) > maxError2)
362                     {
363                         maxError2 = llabs(iErr);
364                         maxErrorVal2 = s[j];
365                     }
366 
367                     if (fail)
368                     {
369                         vlog_error("\nERROR: %sD%s: {%f, %d} ulp error at "
370                                    "%.13la: *{%.13la, %d} vs. {%.13la, %d}\n",
371                                    f->name, sizeNames[k], err, (int)iErr,
372                                    ((double *)gIn)[j], ((double *)gOut_Ref)[j],
373                                    ((int *)gOut_Ref2)[j], test, q2[j]);
374                         error = -1;
375                         goto exit;
376                     }
377                 }
378             }
379         }
380 
381         if (0 == (i & 0x0fffffff))
382         {
383             if (gVerboseBruteForce)
384             {
385                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
386                      BUFFER_SIZE);
387             }
388             else
389             {
390                 vlog(".");
391             }
392             fflush(stdout);
393         }
394     }
395 
396     if (!gSkipCorrectnessTesting)
397     {
398         if (gWimpyMode)
399             vlog("Wimp pass");
400         else
401             vlog("passed");
402 
403         vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
404              maxErrorVal2);
405     }
406 
407     vlog("\n");
408 
409 exit:
410     // Release
411     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
412     {
413         clReleaseKernel(kernels[k]);
414         clReleaseProgram(programs[k]);
415     }
416 
417     return error;
418 }
419