1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20 
21 #include <cstring>
22 
BuildKernel(const char * name,int vectorSize,cl_kernel * k,cl_program * p,bool relaxedMode)23 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
24                        cl_program *p, bool relaxedMode)
25 {
26     const char *c[] = { "__kernel void math_kernel",
27                         sizeNames[vectorSize],
28                         "( __global float",
29                         sizeNames[vectorSize],
30                         "* out, __global uint",
31                         sizeNames[vectorSize],
32                         "* in )\n"
33                         "{\n"
34                         "   size_t i = get_global_id(0);\n"
35                         "   out[i] = ",
36                         name,
37                         "( in[i] );\n"
38                         "}\n" };
39 
40     const char *c3[] = {
41         "__kernel void math_kernel",
42         sizeNames[vectorSize],
43         "( __global float* out, __global uint* in)\n"
44         "{\n"
45         "   size_t i = get_global_id(0);\n"
46         "   if( i + 1 < get_global_size(0) )\n"
47         "   {\n"
48         "       uint3 u0 = vload3( 0, in + 3 * i );\n"
49         "       float3 f0 = ",
50         name,
51         "( u0 );\n"
52         "       vstore3( f0, 0, out + 3*i );\n"
53         "   }\n"
54         "   else\n"
55         "   {\n"
56         "       size_t parity = i & 1;   // Figure out how many elements are "
57         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
58         "buffer size \n"
59         "       uint3 u0;\n"
60         "       float3 f0;\n"
61         "       switch( parity )\n"
62         "       {\n"
63         "           case 1:\n"
64         "               u0 = (uint3)( in[3*i], 0xdead, 0xdead ); \n"
65         "               break;\n"
66         "           case 0:\n"
67         "               u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n"
68         "               break;\n"
69         "       }\n"
70         "       f0 = ",
71         name,
72         "( u0 );\n"
73         "       switch( parity )\n"
74         "       {\n"
75         "           case 0:\n"
76         "               out[3*i+1] = f0.y; \n"
77         "               // fall through\n"
78         "           case 1:\n"
79         "               out[3*i] = f0.x; \n"
80         "               break;\n"
81         "       }\n"
82         "   }\n"
83         "}\n"
84     };
85 
86     const char **kern = c;
87     size_t kernSize = sizeof(c) / sizeof(c[0]);
88 
89     if (sizeValues[vectorSize] == 3)
90     {
91         kern = c3;
92         kernSize = sizeof(c3) / sizeof(c3[0]);
93     }
94 
95     char testName[32];
96     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
97              sizeNames[vectorSize]);
98 
99     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
100 }
101 
102 typedef struct BuildKernelInfo
103 {
104     cl_uint offset; // the first vector size to build
105     cl_kernel *kernels;
106     cl_program *programs;
107     const char *nameInCode;
108     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
109 } BuildKernelInfo;
110 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)111 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
112 {
113     BuildKernelInfo *info = (BuildKernelInfo *)p;
114     cl_uint i = info->offset + job_id;
115     return BuildKernel(info->nameInCode, i, info->kernels + i,
116                        info->programs + i, info->relaxedMode);
117 }
118 
TestFunc_Float_UInt(const Func * f,MTdata d,bool relaxedMode)119 int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
120 {
121     int error;
122     cl_program programs[VECTOR_SIZE_COUNT];
123     cl_kernel kernels[VECTOR_SIZE_COUNT];
124     float maxError = 0.0f;
125     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
126     float maxErrorVal = 0.0f;
127     uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
128     int scale = (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(double)) + 1);
129 
130     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
131 
132     float float_ulps;
133     if (gIsEmbedded)
134         float_ulps = f->float_embedded_ulps;
135     else
136         float_ulps = f->float_ulps;
137 
138     // Init the kernels
139     {
140         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
141                                        f->nameInCode, relaxedMode };
142         if ((error = ThreadPool_Do(BuildKernelFn,
143                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
144                                    &build_info)))
145             return error;
146     }
147 
148     for (uint64_t i = 0; i < (1ULL << 32); i += step)
149     {
150         // Init input array
151         uint32_t *p = (uint32_t *)gIn;
152         if (gWimpyMode)
153         {
154             for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
155                 p[j] = (uint32_t)i + j * scale;
156         }
157         else
158         {
159             for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
160                 p[j] = (uint32_t)i + j;
161         }
162         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
163                                           BUFFER_SIZE, gIn, 0, NULL, NULL)))
164         {
165             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
166             return error;
167         }
168 
169         // write garbage into output arrays
170         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
171         {
172             uint32_t pattern = 0xffffdead;
173             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
174             if ((error =
175                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
176                                           BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
177             {
178                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
179                            error, j);
180                 goto exit;
181             }
182         }
183 
184         // Run the kernels
185         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
186         {
187             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
188             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
189             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
190                                         &gOutBuffer[j])))
191             {
192                 LogBuildError(programs[j]);
193                 goto exit;
194             }
195             if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
196                                         &gInBuffer)))
197             {
198                 LogBuildError(programs[j]);
199                 goto exit;
200             }
201 
202             if ((error =
203                      clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
204                                             &localCount, NULL, 0, NULL, NULL)))
205             {
206                 vlog_error("FAILED -- could not execute kernel\n");
207                 goto exit;
208             }
209         }
210 
211         // Get that moving
212         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
213 
214         // Calculate the correctly rounded reference result
215         float *r = (float *)gOut_Ref;
216         cl_uint *s = (cl_uint *)gIn;
217         for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
218             r[j] = (float)f->func.f_u(s[j]);
219 
220         // Read the data back
221         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
222         {
223             if ((error =
224                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
225                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
226             {
227                 vlog_error("ReadArray failed %d\n", error);
228                 goto exit;
229             }
230         }
231 
232         if (gSkipCorrectnessTesting) break;
233 
234         // Verify data
235         uint32_t *t = (uint32_t *)gOut_Ref;
236         for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
237         {
238             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
239             {
240                 uint32_t *q = (uint32_t *)(gOut[k]);
241 
242                 // If we aren't getting the correctly rounded result
243                 if (t[j] != q[j])
244                 {
245                     float test = ((float *)q)[j];
246                     double correct = f->func.f_u(s[j]);
247                     float err = Ulp_Error(test, correct);
248                     int fail = !(fabsf(err) <= float_ulps);
249 
250                     if (fail)
251                     {
252                         if (ftz)
253                         {
254                             // retry per section 6.5.3.2
255                             if (IsFloatResultSubnormal(correct, float_ulps))
256                             {
257                                 fail = fail && (test != 0.0f);
258                                 if (!fail) err = 0.0f;
259                             }
260                         }
261                     }
262                     if (fabsf(err) > maxError)
263                     {
264                         maxError = fabsf(err);
265                         maxErrorVal = s[j];
266                     }
267                     if (fail)
268                     {
269                         vlog_error(
270                             "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n",
271                             f->name, sizeNames[k], err, ((uint32_t *)gIn)[j],
272                             ((float *)gOut_Ref)[j], test);
273                         error = -1;
274                         goto exit;
275                     }
276                 }
277             }
278         }
279 
280         if (0 == (i & 0x0fffffff))
281         {
282             if (gVerboseBruteForce)
283             {
284                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
285                      BUFFER_SIZE);
286             }
287             else
288             {
289                 vlog(".");
290             }
291             fflush(stdout);
292         }
293     }
294 
295     if (!gSkipCorrectnessTesting)
296     {
297         if (gWimpyMode)
298             vlog("Wimp pass");
299         else
300             vlog("passed");
301 
302         vlog("\t%8.2f @ %a", maxError, maxErrorVal);
303     }
304 
305     vlog("\n");
306 
307 exit:
308     // Release
309     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
310     {
311         clReleaseKernel(kernels[k]);
312         clReleaseProgram(programs[k]);
313     }
314 
315     return error;
316 }
317