1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20
21 #include <cstring>
22
23 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
24
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)25 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
26 cl_kernel *k, cl_program *p, bool relaxedMode)
27 {
28 const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
29 "__kernel void math_kernel",
30 sizeNames[vectorSize],
31 "( __global double",
32 sizeNames[vectorSize],
33 "* out, __global double",
34 sizeNames[vectorSize],
35 "* in1, __global double",
36 sizeNames[vectorSize],
37 "* in2 )\n"
38 "{\n"
39 " size_t i = get_global_id(0);\n"
40 " out[i] = ",
41 name,
42 "( in1[i], in2[i] );\n"
43 "}\n" };
44
45 const char *c3[] = {
46 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
47 "__kernel void math_kernel",
48 sizeNames[vectorSize],
49 "( __global double* out, __global double* in, __global double* in2)\n"
50 "{\n"
51 " size_t i = get_global_id(0);\n"
52 " if( i + 1 < get_global_size(0) )\n"
53 " {\n"
54 " double3 d0 = vload3( 0, in + 3 * i );\n"
55 " double3 d1 = vload3( 0, in2 + 3 * i );\n"
56 " d0 = ",
57 name,
58 "( d0, d1 );\n"
59 " vstore3( d0, 0, out + 3*i );\n"
60 " }\n"
61 " else\n"
62 " {\n"
63 " size_t parity = i & 1; // Figure out how many elements are "
64 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
65 "buffer size \n"
66 " double3 d0;\n"
67 " double3 d1;\n"
68 " switch( parity )\n"
69 " {\n"
70 " case 1:\n"
71 " d0 = (double3)( in[3*i], NAN, NAN ); \n"
72 " d1 = (double3)( in2[3*i], NAN, NAN ); \n"
73 " break;\n"
74 " case 0:\n"
75 " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
76 " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
77 " break;\n"
78 " }\n"
79 " d0 = ",
80 name,
81 "( d0, d1 );\n"
82 " switch( parity )\n"
83 " {\n"
84 " case 0:\n"
85 " out[3*i+1] = d0.y; \n"
86 " // fall through\n"
87 " case 1:\n"
88 " out[3*i] = d0.x; \n"
89 " break;\n"
90 " }\n"
91 " }\n"
92 "}\n"
93 };
94
95 const char **kern = c;
96 size_t kernSize = sizeof(c) / sizeof(c[0]);
97
98 if (sizeValues[vectorSize] == 3)
99 {
100 kern = c3;
101 kernSize = sizeof(c3) / sizeof(c3[0]);
102 }
103
104 char testName[32];
105 snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
106 sizeNames[vectorSize]);
107
108 return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
109 relaxedMode);
110 }
111
112 typedef struct BuildKernelInfo
113 {
114 cl_uint offset; // the first vector size to build
115 cl_uint kernel_count;
116 cl_kernel **kernels;
117 cl_program *programs;
118 const char *nameInCode;
119 bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
120 } BuildKernelInfo;
121
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)122 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
123 {
124 BuildKernelInfo *info = (BuildKernelInfo *)p;
125 cl_uint i = info->offset + job_id;
126 return BuildKernel(info->nameInCode, i, info->kernel_count,
127 info->kernels[i], info->programs + i, info->relaxedMode);
128 }
129
130 // Thread specific data for a worker thread
131 typedef struct ThreadInfo
132 {
133 cl_mem inBuf; // input buffer for the thread
134 cl_mem inBuf2; // input buffer for the thread
135 cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
136 float maxError; // max error value. Init to 0.
137 double
138 maxErrorValue; // position of the max error value (param 1). Init to 0.
139 double maxErrorValue2; // position of the max error value (param 2). Init
140 // to 0.
141 MTdata d;
142 cl_command_queue tQueue; // per thread command queue to improve performance
143 } ThreadInfo;
144
145 typedef struct TestInfo
146 {
147 size_t subBufferSize; // Size of the sub-buffer in elements
148 const Func *f; // A pointer to the function info
149 cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
150 cl_kernel
151 *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
152 // worker thread: k[vector_size][thread_id]
153 ThreadInfo *
154 tinfo; // An array of thread specific information for each worker thread
155 cl_uint threadCount; // Number of worker threads
156 cl_uint jobCount; // Number of jobs
157 cl_uint step; // step between each chunk and the next.
158 cl_uint scale; // stride between individual test values
159 float ulps; // max_allowed ulps
160 int ftz; // non-zero if running in flush to zero mode
161
162 int isFDim;
163 int skipNanInf;
164 int isNextafter;
165 bool relaxedMode; // True if test is running in relaxed mode, false
166 // otherwise.
167 } TestInfo;
168
169 // A table of more difficult cases to get right
170 static const double specialValues[] = {
171 -NAN,
172 -INFINITY,
173 -DBL_MAX,
174 MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
175 MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
176 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
177 MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
178 MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
179 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
180 MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
181 MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
182 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
183 MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
184 MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
185 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
186 -1000.0,
187 -100.0,
188 -4.0,
189 -3.5,
190 -3.0,
191 MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
192 -2.5,
193 MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
194 -2.0,
195 MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
196 -1.5,
197 MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
198 MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
199 -1.0,
200 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
201 MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
202 -0.5,
203 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
204 MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
205 -0.25,
206 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
207 MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
208 -DBL_MIN,
209 MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
210 MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
211 MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
212 MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
213 MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
214 MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
215 MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
216 MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
217 MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
218 MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
219 MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
220 MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
221 MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
222 MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
223 -0.0,
224
225 +NAN,
226 +INFINITY,
227 +DBL_MAX,
228 MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
229 MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
230 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
231 MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
232 MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
233 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
234 MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
235 MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
236 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
237 MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
238 MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
239 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
240 +1000.0,
241 +100.0,
242 +4.0,
243 +3.5,
244 +3.0,
245 MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
246 +2.5,
247 MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
248 +2.0,
249 MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
250 +1.5,
251 MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
252 MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
253 +1.0,
254 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
255 MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
256 +0.5,
257 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
258 MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
259 +0.25,
260 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
261 MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
262 +DBL_MIN,
263 MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
264 MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
265 MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
266 MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
267 MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
268 MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
269 MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
270 MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
271 MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
272 MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
273 MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
274 MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
275 MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
276 MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
277 +0.0,
278 };
279
280 static size_t specialValuesCount =
281 sizeof(specialValues) / sizeof(specialValues[0]);
282
283 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
284
TestFunc_Double_Double_Double(const Func * f,MTdata d,bool relaxedMode)285 int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
286 {
287 TestInfo test_info;
288 cl_int error;
289 float maxError = 0.0f;
290 double maxErrorVal = 0.0;
291 double maxErrorVal2 = 0.0;
292
293 logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
294
295 // Init test_info
296 memset(&test_info, 0, sizeof(test_info));
297 test_info.threadCount = GetThreadCount();
298 test_info.subBufferSize = BUFFER_SIZE
299 / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
300 test_info.scale = getTestScale(sizeof(cl_double));
301
302 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
303 if (test_info.step / test_info.subBufferSize != test_info.scale)
304 {
305 // there was overflow
306 test_info.jobCount = 1;
307 }
308 else
309 {
310 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
311 }
312
313 test_info.f = f;
314 test_info.ulps = f->double_ulps;
315 test_info.ftz = f->ftz || gForceFTZ;
316
317 test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
318 test_info.skipNanInf = 0;
319 test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
320
321 // cl_kernels aren't thread safe, so we make one for each vector size for
322 // every thread
323 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
324 {
325 size_t array_size = test_info.threadCount * sizeof(cl_kernel);
326 test_info.k[i] = (cl_kernel *)malloc(array_size);
327 if (NULL == test_info.k[i])
328 {
329 vlog_error("Error: Unable to allocate storage for kernels!\n");
330 error = CL_OUT_OF_HOST_MEMORY;
331 goto exit;
332 }
333 memset(test_info.k[i], 0, array_size);
334 }
335 test_info.tinfo =
336 (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
337 if (NULL == test_info.tinfo)
338 {
339 vlog_error(
340 "Error: Unable to allocate storage for thread specific data.\n");
341 error = CL_OUT_OF_HOST_MEMORY;
342 goto exit;
343 }
344 memset(test_info.tinfo, 0,
345 test_info.threadCount * sizeof(*test_info.tinfo));
346 for (cl_uint i = 0; i < test_info.threadCount; i++)
347 {
348 cl_buffer_region region = {
349 i * test_info.subBufferSize * sizeof(cl_double),
350 test_info.subBufferSize * sizeof(cl_double)
351 };
352 test_info.tinfo[i].inBuf =
353 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
354 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
355 if (error || NULL == test_info.tinfo[i].inBuf)
356 {
357 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
358 "region {%zd, %zd}\n",
359 region.origin, region.size);
360 goto exit;
361 }
362 test_info.tinfo[i].inBuf2 =
363 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
364 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
365 if (error || NULL == test_info.tinfo[i].inBuf2)
366 {
367 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
368 "region {%zd, %zd}\n",
369 region.origin, region.size);
370 goto exit;
371 }
372
373 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
374 {
375 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
376 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
377 ®ion, &error);
378 if (error || NULL == test_info.tinfo[i].outBuf[j])
379 {
380 vlog_error("Error: Unable to create sub-buffer of "
381 "gOutBuffer[%d] for region {%zd, %zd}\n",
382 (int)j, region.origin, region.size);
383 goto exit;
384 }
385 }
386 test_info.tinfo[i].tQueue =
387 clCreateCommandQueue(gContext, gDevice, 0, &error);
388 if (NULL == test_info.tinfo[i].tQueue || error)
389 {
390 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
391 goto exit;
392 }
393
394 test_info.tinfo[i].d = init_genrand(genrand_int32(d));
395 }
396
397 // Init the kernels
398 {
399 BuildKernelInfo build_info = {
400 gMinVectorSizeIndex, test_info.threadCount, test_info.k,
401 test_info.programs, f->nameInCode, relaxedMode
402 };
403 if ((error = ThreadPool_Do(BuildKernelFn,
404 gMaxVectorSizeIndex - gMinVectorSizeIndex,
405 &build_info)))
406 goto exit;
407 }
408
409 // Run the kernels
410 if (!gSkipCorrectnessTesting)
411 {
412 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
413
414 // Accumulate the arithmetic errors
415 for (cl_uint i = 0; i < test_info.threadCount; i++)
416 {
417 if (test_info.tinfo[i].maxError > maxError)
418 {
419 maxError = test_info.tinfo[i].maxError;
420 maxErrorVal = test_info.tinfo[i].maxErrorValue;
421 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
422 }
423 }
424
425 if (error) goto exit;
426
427 if (gWimpyMode)
428 vlog("Wimp pass");
429 else
430 vlog("passed");
431
432 vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
433 }
434
435 vlog("\n");
436
437 exit:
438 // Release
439 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
440 {
441 clReleaseProgram(test_info.programs[i]);
442 if (test_info.k[i])
443 {
444 for (cl_uint j = 0; j < test_info.threadCount; j++)
445 clReleaseKernel(test_info.k[i][j]);
446
447 free(test_info.k[i]);
448 }
449 }
450 if (test_info.tinfo)
451 {
452 for (cl_uint i = 0; i < test_info.threadCount; i++)
453 {
454 free_mtdata(test_info.tinfo[i].d);
455 clReleaseMemObject(test_info.tinfo[i].inBuf);
456 clReleaseMemObject(test_info.tinfo[i].inBuf2);
457 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
458 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
459 clReleaseCommandQueue(test_info.tinfo[i].tQueue);
460 }
461
462 free(test_info.tinfo);
463 }
464
465 return error;
466 }
467
Test(cl_uint job_id,cl_uint thread_id,void * data)468 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
469 {
470 const TestInfo *job = (const TestInfo *)data;
471 size_t buffer_elements = job->subBufferSize;
472 size_t buffer_size = buffer_elements * sizeof(cl_double);
473 cl_uint base = job_id * (cl_uint)job->step;
474 ThreadInfo *tinfo = job->tinfo + thread_id;
475 float ulps = job->ulps;
476 dptr func = job->f->dfunc;
477 int ftz = job->ftz;
478 MTdata d = tinfo->d;
479 cl_int error;
480 const char *name = job->f->name;
481
482 int isNextafter = job->isNextafter;
483 cl_ulong *t;
484 cl_double *r;
485 cl_double *s;
486 cl_double *s2;
487
488 Force64BitFPUPrecision();
489
490 // start the map of the output arrays
491 cl_event e[VECTOR_SIZE_COUNT];
492 cl_ulong *out[VECTOR_SIZE_COUNT];
493 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
494 {
495 out[j] = (cl_ulong *)clEnqueueMapBuffer(
496 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
497 buffer_size, 0, NULL, e + j, &error);
498 if (error || NULL == out[j])
499 {
500 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
501 error);
502 return error;
503 }
504 }
505
506 // Get that moving
507 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
508
509 // Init input array
510 cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
511 cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
512 cl_uint idx = 0;
513 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
514 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
515
516 if (job_id <= (cl_uint)lastSpecialJobIndex)
517 { // test edge cases
518 cl_double *fp = (cl_double *)p;
519 cl_double *fp2 = (cl_double *)p2;
520 uint32_t x, y;
521
522 x = (job_id * buffer_elements) % specialValuesCount;
523 y = (job_id * buffer_elements) / specialValuesCount;
524
525 for (; idx < buffer_elements; idx++)
526 {
527 fp[idx] = specialValues[x];
528 fp2[idx] = specialValues[y];
529 if (++x >= specialValuesCount)
530 {
531 x = 0;
532 y++;
533 if (y >= specialValuesCount) break;
534 }
535 }
536 }
537
538 // Init any remaining values.
539 for (; idx < buffer_elements; idx++)
540 {
541 p[idx] = genrand_int64(d);
542 p2[idx] = genrand_int64(d);
543 }
544
545 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
546 buffer_size, p, 0, NULL, NULL)))
547 {
548 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
549 goto exit;
550 }
551
552 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
553 buffer_size, p2, 0, NULL, NULL)))
554 {
555 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
556 goto exit;
557 }
558
559 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
560 {
561 // Wait for the map to finish
562 if ((error = clWaitForEvents(1, e + j)))
563 {
564 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
565 goto exit;
566 }
567 if ((error = clReleaseEvent(e[j])))
568 {
569 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
570 goto exit;
571 }
572
573 // Fill the result buffer with garbage, so that old results don't carry
574 // over
575 uint32_t pattern = 0xffffdead;
576 memset_pattern4(out[j], &pattern, buffer_size);
577 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
578 out[j], 0, NULL, NULL)))
579 {
580 vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
581 goto exit;
582 }
583
584 // run the kernel
585 size_t vectorCount =
586 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
587 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
588 // own copy of the cl_kernel
589 cl_program program = job->programs[j];
590
591 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
592 &tinfo->outBuf[j])))
593 {
594 LogBuildError(program);
595 return error;
596 }
597 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
598 &tinfo->inBuf)))
599 {
600 LogBuildError(program);
601 return error;
602 }
603 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
604 &tinfo->inBuf2)))
605 {
606 LogBuildError(program);
607 return error;
608 }
609
610 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
611 &vectorCount, NULL, 0, NULL, NULL)))
612 {
613 vlog_error("FAILED -- could not execute kernel\n");
614 goto exit;
615 }
616 }
617
618 // Get that moving
619 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
620
621 if (gSkipCorrectnessTesting) return CL_SUCCESS;
622
623 // Calculate the correctly rounded reference result
624 r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
625 s = (cl_double *)gIn + thread_id * buffer_elements;
626 s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
627 for (size_t j = 0; j < buffer_elements; j++)
628 r[j] = (cl_double)func.f_ff(s[j], s2[j]);
629
630 // Read the data back -- no need to wait for the first N-1 buffers but wait
631 // for the last buffer. This is an in order queue.
632 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
633 {
634 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
635 out[j] = (cl_ulong *)clEnqueueMapBuffer(
636 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
637 buffer_size, 0, NULL, NULL, &error);
638 if (error || NULL == out[j])
639 {
640 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
641 error);
642 goto exit;
643 }
644 }
645
646 // Verify data
647 t = (cl_ulong *)r;
648 for (size_t j = 0; j < buffer_elements; j++)
649 {
650 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
651 {
652 cl_ulong *q = out[k];
653
654 // If we aren't getting the correctly rounded result
655 if (t[j] != q[j])
656 {
657 cl_double test = ((cl_double *)q)[j];
658 long double correct = func.f_ff(s[j], s2[j]);
659 float err = Bruteforce_Ulp_Error_Double(test, correct);
660 int fail = !(fabsf(err) <= ulps);
661
662 if (fail && ftz)
663 {
664 // retry per section 6.5.3.2
665 if (IsDoubleResultSubnormal(correct, ulps))
666 {
667 fail = fail && (test != 0.0f);
668 if (!fail) err = 0.0f;
669 }
670
671 // nextafter on FTZ platforms may return the smallest
672 // normal float (2^-126) given a denormal or a zero
673 // as the first argument. The rationale here is that
674 // nextafter flushes the argument to zero and then
675 // returns the next representable number in the
676 // direction of the second argument, and since
677 // denorms are considered as zero, the smallest
678 // normal number is the next representable number.
679 // In which case, it should have the same sign as the
680 // second argument.
681 if (isNextafter)
682 {
683 if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
684 {
685 cl_double value = copysign(twoToMinus1022, s2[j]);
686 fail = fail && (test != value);
687 if (!fail) err = 0.0f;
688 }
689 }
690 else
691 {
692 // retry per section 6.5.3.3
693 if (IsDoubleSubnormal(s[j]))
694 {
695 long double correct2 = func.f_ff(0.0, s2[j]);
696 long double correct3 = func.f_ff(-0.0, s2[j]);
697 float err2 =
698 Bruteforce_Ulp_Error_Double(test, correct2);
699 float err3 =
700 Bruteforce_Ulp_Error_Double(test, correct3);
701 fail = fail
702 && ((!(fabsf(err2) <= ulps))
703 && (!(fabsf(err3) <= ulps)));
704 if (fabsf(err2) < fabsf(err)) err = err2;
705 if (fabsf(err3) < fabsf(err)) err = err3;
706
707 // retry per section 6.5.3.4
708 if (IsDoubleResultSubnormal(correct2, ulps)
709 || IsDoubleResultSubnormal(correct3, ulps))
710 {
711 fail = fail && (test != 0.0f);
712 if (!fail) err = 0.0f;
713 }
714
715 // try with both args as zero
716 if (IsDoubleSubnormal(s2[j]))
717 {
718 correct2 = func.f_ff(0.0, 0.0);
719 correct3 = func.f_ff(-0.0, 0.0);
720 long double correct4 = func.f_ff(0.0, -0.0);
721 long double correct5 = func.f_ff(-0.0, -0.0);
722 err2 =
723 Bruteforce_Ulp_Error_Double(test, correct2);
724 err3 =
725 Bruteforce_Ulp_Error_Double(test, correct3);
726 float err4 =
727 Bruteforce_Ulp_Error_Double(test, correct4);
728 float err5 =
729 Bruteforce_Ulp_Error_Double(test, correct5);
730 fail = fail
731 && ((!(fabsf(err2) <= ulps))
732 && (!(fabsf(err3) <= ulps))
733 && (!(fabsf(err4) <= ulps))
734 && (!(fabsf(err5) <= ulps)));
735 if (fabsf(err2) < fabsf(err)) err = err2;
736 if (fabsf(err3) < fabsf(err)) err = err3;
737 if (fabsf(err4) < fabsf(err)) err = err4;
738 if (fabsf(err5) < fabsf(err)) err = err5;
739
740 // retry per section 6.5.3.4
741 if (IsDoubleResultSubnormal(correct2, ulps)
742 || IsDoubleResultSubnormal(correct3, ulps)
743 || IsDoubleResultSubnormal(correct4, ulps)
744 || IsDoubleResultSubnormal(correct5, ulps))
745 {
746 fail = fail && (test != 0.0f);
747 if (!fail) err = 0.0f;
748 }
749 }
750 }
751 else if (IsDoubleSubnormal(s2[j]))
752 {
753 long double correct2 = func.f_ff(s[j], 0.0);
754 long double correct3 = func.f_ff(s[j], -0.0);
755 float err2 =
756 Bruteforce_Ulp_Error_Double(test, correct2);
757 float err3 =
758 Bruteforce_Ulp_Error_Double(test, correct3);
759 fail = fail
760 && ((!(fabsf(err2) <= ulps))
761 && (!(fabsf(err3) <= ulps)));
762 if (fabsf(err2) < fabsf(err)) err = err2;
763 if (fabsf(err3) < fabsf(err)) err = err3;
764
765 // retry per section 6.5.3.4
766 if (IsDoubleResultSubnormal(correct2, ulps)
767 || IsDoubleResultSubnormal(correct3, ulps))
768 {
769 fail = fail && (test != 0.0f);
770 if (!fail) err = 0.0f;
771 }
772 }
773 }
774 }
775
776 if (fabsf(err) > tinfo->maxError)
777 {
778 tinfo->maxError = fabsf(err);
779 tinfo->maxErrorValue = s[j];
780 tinfo->maxErrorValue2 = s2[j];
781 }
782 if (fail)
783 {
784 vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
785 "%.13la}: *%.13la vs. %.13la\n",
786 name, sizeNames[k], err, s[j], s2[j], r[j],
787 test);
788 error = -1;
789 goto exit;
790 }
791 }
792 }
793 }
794
795 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
796 {
797 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
798 out[j], 0, NULL, NULL)))
799 {
800 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
801 j, error);
802 return error;
803 }
804 }
805
806 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
807
808
809 if (0 == (base & 0x0fffffff))
810 {
811 if (gVerboseBruteForce)
812 {
813 vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
814 "ThreadCount:%2u\n",
815 base, job->step, job->scale, buffer_elements, job->ulps,
816 job->threadCount);
817 }
818 else
819 {
820 vlog(".");
821 }
822 fflush(stdout);
823 }
824
825 exit:
826 return error;
827 }
828