1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20
21 #include <cstring>
22
BuildKernel(const char * operator_symbol,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)23 static int BuildKernel(const char *operator_symbol, int vectorSize,
24 cl_uint kernel_count, cl_kernel *k, cl_program *p,
25 bool relaxedMode)
26 {
27 const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
28 "__kernel void math_kernel",
29 sizeNames[vectorSize],
30 "( __global double",
31 sizeNames[vectorSize],
32 "* out, __global double",
33 sizeNames[vectorSize],
34 "* in1, __global double",
35 sizeNames[vectorSize],
36 "* in2 )\n"
37 "{\n"
38 " size_t i = get_global_id(0);\n"
39 " out[i] = in1[i] ",
40 operator_symbol,
41 " in2[i];\n"
42 "}\n" };
43
44 const char *c3[] = {
45 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
46 "__kernel void math_kernel",
47 sizeNames[vectorSize],
48 "( __global double* out, __global double* in, __global double* in2)\n"
49 "{\n"
50 " size_t i = get_global_id(0);\n"
51 " if( i + 1 < get_global_size(0) )\n"
52 " {\n"
53 " double3 d0 = vload3( 0, in + 3 * i );\n"
54 " double3 d1 = vload3( 0, in2 + 3 * i );\n"
55 " d0 = d0 ",
56 operator_symbol,
57 " d1;\n"
58 " vstore3( d0, 0, out + 3*i );\n"
59 " }\n"
60 " else\n"
61 " {\n"
62 " size_t parity = i & 1; // Figure out how many elements are "
63 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
64 "buffer size \n"
65 " double3 d0;\n"
66 " double3 d1;\n"
67 " switch( parity )\n"
68 " {\n"
69 " case 1:\n"
70 " d0 = (double3)( in[3*i], NAN, NAN ); \n"
71 " d1 = (double3)( in2[3*i], NAN, NAN ); \n"
72 " break;\n"
73 " case 0:\n"
74 " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
75 " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
76 " break;\n"
77 " }\n"
78 " d0 = d0 ",
79 operator_symbol,
80 " d1;\n"
81 " switch( parity )\n"
82 " {\n"
83 " case 0:\n"
84 " out[3*i+1] = d0.y; \n"
85 " // fall through\n"
86 " case 1:\n"
87 " out[3*i] = d0.x; \n"
88 " break;\n"
89 " }\n"
90 " }\n"
91 "}\n"
92 };
93
94 const char **kern = c;
95 size_t kernSize = sizeof(c) / sizeof(c[0]);
96
97 if (sizeValues[vectorSize] == 3)
98 {
99 kern = c3;
100 kernSize = sizeof(c3) / sizeof(c3[0]);
101 }
102
103 char testName[32];
104 snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
105 sizeNames[vectorSize]);
106
107 return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
108 relaxedMode);
109 }
110
111 typedef struct BuildKernelInfo
112 {
113 cl_uint offset; // the first vector size to build
114 cl_uint kernel_count;
115 cl_kernel **kernels;
116 cl_program *programs;
117 const char *operator_symbol;
118 bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
119 } BuildKernelInfo;
120
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)121 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
122 {
123 BuildKernelInfo *info = (BuildKernelInfo *)p;
124 cl_uint i = info->offset + job_id;
125 return BuildKernel(info->operator_symbol, i, info->kernel_count,
126 info->kernels[i], info->programs + i, info->relaxedMode);
127 }
128
129 // Thread specific data for a worker thread
130 typedef struct ThreadInfo
131 {
132 cl_mem inBuf; // input buffer for the thread
133 cl_mem inBuf2; // input buffer for the thread
134 cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
135 float maxError; // max error value. Init to 0.
136 double
137 maxErrorValue; // position of the max error value (param 1). Init to 0.
138 double maxErrorValue2; // position of the max error value (param 2). Init
139 // to 0.
140 MTdata d;
141 cl_command_queue tQueue; // per thread command queue to improve performance
142 } ThreadInfo;
143
144 typedef struct TestInfo
145 {
146 size_t subBufferSize; // Size of the sub-buffer in elements
147 const Func *f; // A pointer to the function info
148 cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
149 cl_kernel
150 *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
151 // worker thread: k[vector_size][thread_id]
152 ThreadInfo *
153 tinfo; // An array of thread specific information for each worker thread
154 cl_uint threadCount; // Number of worker threads
155 cl_uint jobCount; // Number of jobs
156 cl_uint step; // step between each chunk and the next.
157 cl_uint scale; // stride between individual test values
158 float ulps; // max_allowed ulps
159 int ftz; // non-zero if running in flush to zero mode
160 bool relaxedMode; // True if the test is being run in relaxed mode, false
161 // otherwise.
162
163 // no special fields
164 } TestInfo;
165
166 // A table of more difficult cases to get right
167 static const double specialValues[] = {
168 -NAN,
169 -INFINITY,
170 -DBL_MAX,
171 MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
172 MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
173 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
174 MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
175 MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
176 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
177 MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
178 MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
179 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
180 MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
181 MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
182 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
183 -1000.,
184 -100.,
185 -4.0,
186 -3.5,
187 -3.0,
188 MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
189 -2.5,
190 MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
191 -2.0,
192 MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
193 -1.5,
194 MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
195 MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
196 -1.0,
197 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
198 MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
199 -0.5,
200 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
201 MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
202 -0.25,
203 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
204 MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
205 -DBL_MIN,
206 MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
207 MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
208 MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
209 MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
210 MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
211 MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
212 MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
213 MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
214 MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
215 MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
216 MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
217 MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
218 MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
219 MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
220 -0.0,
221
222 +NAN,
223 +INFINITY,
224 +DBL_MAX,
225 MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
226 MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
227 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
228 MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
229 MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
230 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
231 MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
232 MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
233 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
234 MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
235 MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
236 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
237 +1000.0,
238 +100.0,
239 +4.0,
240 +3.5,
241 +3.0,
242 MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
243 +2.5,
244 MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
245 +2.0,
246 MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
247 +1.5,
248 MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
249 MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
250 +1.0,
251 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
252 MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
253 +0.5,
254 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
255 MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
256 +0.25,
257 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
258 MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
259 +DBL_MIN,
260 MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
261 MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
262 MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
263 MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
264 MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
265 MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
266 MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
267 MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
268 MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
269 MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
270 MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
271 MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
272 MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
273 MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
274 +0.0,
275 };
276
277 static const size_t specialValuesCount =
278 sizeof(specialValues) / sizeof(specialValues[0]);
279
280 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
281
TestFunc_Double_Double_Double_Operator(const Func * f,MTdata d,bool relaxedMode)282 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
283 bool relaxedMode)
284 {
285 TestInfo test_info;
286 cl_int error;
287 float maxError = 0.0f;
288 double maxErrorVal = 0.0;
289 double maxErrorVal2 = 0.0;
290
291 logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
292
293 // Init test_info
294 memset(&test_info, 0, sizeof(test_info));
295 test_info.threadCount = GetThreadCount();
296 test_info.subBufferSize = BUFFER_SIZE
297 / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
298 test_info.scale = getTestScale(sizeof(cl_double));
299
300 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
301 if (test_info.step / test_info.subBufferSize != test_info.scale)
302 {
303 // there was overflow
304 test_info.jobCount = 1;
305 }
306 else
307 {
308 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
309 }
310
311 test_info.f = f;
312 test_info.ulps = f->double_ulps;
313 test_info.ftz = f->ftz || gForceFTZ;
314
315 // cl_kernels aren't thread safe, so we make one for each vector size for
316 // every thread
317 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
318 {
319 size_t array_size = test_info.threadCount * sizeof(cl_kernel);
320 test_info.k[i] = (cl_kernel *)malloc(array_size);
321 if (NULL == test_info.k[i])
322 {
323 vlog_error("Error: Unable to allocate storage for kernels!\n");
324 error = CL_OUT_OF_HOST_MEMORY;
325 goto exit;
326 }
327 memset(test_info.k[i], 0, array_size);
328 }
329 test_info.tinfo =
330 (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
331 if (NULL == test_info.tinfo)
332 {
333 vlog_error(
334 "Error: Unable to allocate storage for thread specific data.\n");
335 error = CL_OUT_OF_HOST_MEMORY;
336 goto exit;
337 }
338 memset(test_info.tinfo, 0,
339 test_info.threadCount * sizeof(*test_info.tinfo));
340 for (cl_uint i = 0; i < test_info.threadCount; i++)
341 {
342 cl_buffer_region region = {
343 i * test_info.subBufferSize * sizeof(cl_double),
344 test_info.subBufferSize * sizeof(cl_double)
345 };
346 test_info.tinfo[i].inBuf =
347 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
348 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
349 if (error || NULL == test_info.tinfo[i].inBuf)
350 {
351 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
352 "region {%zd, %zd}\n",
353 region.origin, region.size);
354 goto exit;
355 }
356 test_info.tinfo[i].inBuf2 =
357 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
358 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
359 if (error || NULL == test_info.tinfo[i].inBuf2)
360 {
361 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
362 "region {%zd, %zd}\n",
363 region.origin, region.size);
364 goto exit;
365 }
366
367 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
368 {
369 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
370 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
371 ®ion, &error);
372 if (error || NULL == test_info.tinfo[i].outBuf[j])
373 {
374 vlog_error("Error: Unable to create sub-buffer of "
375 "gOutBuffer[%d] for region {%zd, %zd}\n",
376 (int)j, region.origin, region.size);
377 goto exit;
378 }
379 }
380 test_info.tinfo[i].tQueue =
381 clCreateCommandQueue(gContext, gDevice, 0, &error);
382 if (NULL == test_info.tinfo[i].tQueue || error)
383 {
384 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
385 goto exit;
386 }
387
388 test_info.tinfo[i].d = init_genrand(genrand_int32(d));
389 }
390
391 // Init the kernels
392 {
393 BuildKernelInfo build_info = {
394 gMinVectorSizeIndex, test_info.threadCount, test_info.k,
395 test_info.programs, f->nameInCode, relaxedMode
396 };
397 if ((error = ThreadPool_Do(BuildKernelFn,
398 gMaxVectorSizeIndex - gMinVectorSizeIndex,
399 &build_info)))
400 goto exit;
401 }
402
403 // Run the kernels
404 if (!gSkipCorrectnessTesting)
405 {
406 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
407
408 // Accumulate the arithmetic errors
409 for (cl_uint i = 0; i < test_info.threadCount; i++)
410 {
411 if (test_info.tinfo[i].maxError > maxError)
412 {
413 maxError = test_info.tinfo[i].maxError;
414 maxErrorVal = test_info.tinfo[i].maxErrorValue;
415 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
416 }
417 }
418
419 if (error) goto exit;
420
421 if (gWimpyMode)
422 vlog("Wimp pass");
423 else
424 vlog("passed");
425
426 vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
427 }
428
429 vlog("\n");
430
431 exit:
432 // Release
433 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
434 {
435 clReleaseProgram(test_info.programs[i]);
436 if (test_info.k[i])
437 {
438 for (cl_uint j = 0; j < test_info.threadCount; j++)
439 clReleaseKernel(test_info.k[i][j]);
440
441 free(test_info.k[i]);
442 }
443 }
444 if (test_info.tinfo)
445 {
446 for (cl_uint i = 0; i < test_info.threadCount; i++)
447 {
448 free_mtdata(test_info.tinfo[i].d);
449 clReleaseMemObject(test_info.tinfo[i].inBuf);
450 clReleaseMemObject(test_info.tinfo[i].inBuf2);
451 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
452 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
453 clReleaseCommandQueue(test_info.tinfo[i].tQueue);
454 }
455
456 free(test_info.tinfo);
457 }
458
459 return error;
460 }
461
Test(cl_uint job_id,cl_uint thread_id,void * data)462 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
463 {
464 const TestInfo *job = (const TestInfo *)data;
465 size_t buffer_elements = job->subBufferSize;
466 size_t buffer_size = buffer_elements * sizeof(cl_double);
467 cl_uint base = job_id * (cl_uint)job->step;
468 ThreadInfo *tinfo = job->tinfo + thread_id;
469 float ulps = job->ulps;
470 dptr func = job->f->dfunc;
471 int ftz = job->ftz;
472 MTdata d = tinfo->d;
473 cl_int error;
474 const char *name = job->f->name;
475 cl_ulong *t;
476 cl_double *r;
477 cl_double *s;
478 cl_double *s2;
479
480 Force64BitFPUPrecision();
481
482 // start the map of the output arrays
483 cl_event e[VECTOR_SIZE_COUNT];
484 cl_ulong *out[VECTOR_SIZE_COUNT];
485 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
486 {
487 out[j] = (cl_ulong *)clEnqueueMapBuffer(
488 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
489 buffer_size, 0, NULL, e + j, &error);
490 if (error || NULL == out[j])
491 {
492 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
493 error);
494 return error;
495 }
496 }
497
498 // Get that moving
499 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
500
501 // Init input array
502 cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
503 cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
504 cl_uint idx = 0;
505 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
506 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
507
508 if (job_id <= (cl_uint)lastSpecialJobIndex)
509 { // test edge cases
510 cl_double *fp = (cl_double *)p;
511 cl_double *fp2 = (cl_double *)p2;
512 uint32_t x, y;
513
514 x = (job_id * buffer_elements) % specialValuesCount;
515 y = (job_id * buffer_elements) / specialValuesCount;
516
517 for (; idx < buffer_elements; idx++)
518 {
519 fp[idx] = specialValues[x];
520 fp2[idx] = specialValues[y];
521 if (++x >= specialValuesCount)
522 {
523 x = 0;
524 y++;
525 if (y >= specialValuesCount) break;
526 }
527 }
528 }
529
530 // Init any remaining values.
531 for (; idx < buffer_elements; idx++)
532 {
533 p[idx] = genrand_int64(d);
534 p2[idx] = genrand_int64(d);
535 }
536
537 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
538 buffer_size, p, 0, NULL, NULL)))
539 {
540 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
541 goto exit;
542 }
543
544 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
545 buffer_size, p2, 0, NULL, NULL)))
546 {
547 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
548 goto exit;
549 }
550
551 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
552 {
553 // Wait for the map to finish
554 if ((error = clWaitForEvents(1, e + j)))
555 {
556 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
557 goto exit;
558 }
559 if ((error = clReleaseEvent(e[j])))
560 {
561 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
562 goto exit;
563 }
564
565 // Fill the result buffer with garbage, so that old results don't carry
566 // over
567 uint32_t pattern = 0xffffdead;
568 memset_pattern4(out[j], &pattern, buffer_size);
569 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
570 out[j], 0, NULL, NULL)))
571 {
572 vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
573 goto exit;
574 }
575
576 // run the kernel
577 size_t vectorCount =
578 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
579 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
580 // own copy of the cl_kernel
581 cl_program program = job->programs[j];
582
583 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
584 &tinfo->outBuf[j])))
585 {
586 LogBuildError(program);
587 return error;
588 }
589 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
590 &tinfo->inBuf)))
591 {
592 LogBuildError(program);
593 return error;
594 }
595 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
596 &tinfo->inBuf2)))
597 {
598 LogBuildError(program);
599 return error;
600 }
601
602 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
603 &vectorCount, NULL, 0, NULL, NULL)))
604 {
605 vlog_error("FAILED -- could not execute kernel\n");
606 goto exit;
607 }
608 }
609
610 // Get that moving
611 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
612
613 if (gSkipCorrectnessTesting) return CL_SUCCESS;
614
615 // Calculate the correctly rounded reference result
616 r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
617 s = (cl_double *)gIn + thread_id * buffer_elements;
618 s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
619 for (size_t j = 0; j < buffer_elements; j++)
620 r[j] = (cl_double)func.f_ff(s[j], s2[j]);
621
622 // Read the data back -- no need to wait for the first N-1 buffers but wait
623 // for the last buffer. This is an in order queue.
624 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
625 {
626 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
627 out[j] = (cl_ulong *)clEnqueueMapBuffer(
628 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
629 buffer_size, 0, NULL, NULL, &error);
630 if (error || NULL == out[j])
631 {
632 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
633 error);
634 goto exit;
635 }
636 }
637
638 // Verify data
639 t = (cl_ulong *)r;
640 for (size_t j = 0; j < buffer_elements; j++)
641 {
642 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
643 {
644 cl_ulong *q = out[k];
645
646 // If we aren't getting the correctly rounded result
647 if (t[j] != q[j])
648 {
649 cl_double test = ((cl_double *)q)[j];
650 long double correct = func.f_ff(s[j], s2[j]);
651 float err = Bruteforce_Ulp_Error_Double(test, correct);
652 int fail = !(fabsf(err) <= ulps);
653
654 if (fail && ftz)
655 {
656 // retry per section 6.5.3.2
657 if (IsDoubleResultSubnormal(correct, ulps))
658 {
659 fail = fail && (test != 0.0f);
660 if (!fail) err = 0.0f;
661 }
662
663
664 // retry per section 6.5.3.3
665 if (IsDoubleSubnormal(s[j]))
666 {
667 long double correct2 = func.f_ff(0.0, s2[j]);
668 long double correct3 = func.f_ff(-0.0, s2[j]);
669 float err2 =
670 Bruteforce_Ulp_Error_Double(test, correct2);
671 float err3 =
672 Bruteforce_Ulp_Error_Double(test, correct3);
673 fail = fail
674 && ((!(fabsf(err2) <= ulps))
675 && (!(fabsf(err3) <= ulps)));
676 if (fabsf(err2) < fabsf(err)) err = err2;
677 if (fabsf(err3) < fabsf(err)) err = err3;
678
679 // retry per section 6.5.3.4
680 if (IsDoubleResultSubnormal(correct2, ulps)
681 || IsDoubleResultSubnormal(correct3, ulps))
682 {
683 fail = fail && (test != 0.0f);
684 if (!fail) err = 0.0f;
685 }
686
687 // try with both args as zero
688 if (IsDoubleSubnormal(s2[j]))
689 {
690 correct2 = func.f_ff(0.0, 0.0);
691 correct3 = func.f_ff(-0.0, 0.0);
692 long double correct4 = func.f_ff(0.0, -0.0);
693 long double correct5 = func.f_ff(-0.0, -0.0);
694 err2 = Bruteforce_Ulp_Error_Double(test, correct2);
695 err3 = Bruteforce_Ulp_Error_Double(test, correct3);
696 float err4 =
697 Bruteforce_Ulp_Error_Double(test, correct4);
698 float err5 =
699 Bruteforce_Ulp_Error_Double(test, correct5);
700 fail = fail
701 && ((!(fabsf(err2) <= ulps))
702 && (!(fabsf(err3) <= ulps))
703 && (!(fabsf(err4) <= ulps))
704 && (!(fabsf(err5) <= ulps)));
705 if (fabsf(err2) < fabsf(err)) err = err2;
706 if (fabsf(err3) < fabsf(err)) err = err3;
707 if (fabsf(err4) < fabsf(err)) err = err4;
708 if (fabsf(err5) < fabsf(err)) err = err5;
709
710 // retry per section 6.5.3.4
711 if (IsDoubleResultSubnormal(correct2, ulps)
712 || IsDoubleResultSubnormal(correct3, ulps)
713 || IsDoubleResultSubnormal(correct4, ulps)
714 || IsDoubleResultSubnormal(correct5, ulps))
715 {
716 fail = fail && (test != 0.0f);
717 if (!fail) err = 0.0f;
718 }
719 }
720 }
721 else if (IsDoubleSubnormal(s2[j]))
722 {
723 long double correct2 = func.f_ff(s[j], 0.0);
724 long double correct3 = func.f_ff(s[j], -0.0);
725 float err2 =
726 Bruteforce_Ulp_Error_Double(test, correct2);
727 float err3 =
728 Bruteforce_Ulp_Error_Double(test, correct3);
729 fail = fail
730 && ((!(fabsf(err2) <= ulps))
731 && (!(fabsf(err3) <= ulps)));
732 if (fabsf(err2) < fabsf(err)) err = err2;
733 if (fabsf(err3) < fabsf(err)) err = err3;
734
735 // retry per section 6.5.3.4
736 if (IsDoubleResultSubnormal(correct2, ulps)
737 || IsDoubleResultSubnormal(correct3, ulps))
738 {
739 fail = fail && (test != 0.0f);
740 if (!fail) err = 0.0f;
741 }
742 }
743 }
744
745 if (fabsf(err) > tinfo->maxError)
746 {
747 tinfo->maxError = fabsf(err);
748 tinfo->maxErrorValue = s[j];
749 tinfo->maxErrorValue2 = s2[j];
750 }
751 if (fail)
752 {
753 vlog_error(
754 "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
755 name, sizeNames[k], err, s[j], s2[j], r[j], test);
756 error = -1;
757 goto exit;
758 }
759 }
760 }
761 }
762
763 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
764 {
765 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
766 out[j], 0, NULL, NULL)))
767 {
768 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
769 j, error);
770 return error;
771 }
772 }
773
774 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
775
776
777 if (0 == (base & 0x0fffffff))
778 {
779 if (gVerboseBruteForce)
780 {
781 vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
782 "ThreadCount:%2u\n",
783 base, job->step, job->scale, buffer_elements, job->ulps,
784 job->threadCount);
785 }
786 else
787 {
788 vlog(".");
789 }
790 fflush(stdout);
791 }
792
793 exit:
794 return error;
795 }
796