1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20
21 #include <cstring>
22
BuildKernel(const char * operator_symbol,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)23 static int BuildKernel(const char *operator_symbol, int vectorSize,
24 cl_uint kernel_count, cl_kernel *k, cl_program *p,
25 bool relaxedMode)
26 {
27 const char *c[] = { "__kernel void math_kernel",
28 sizeNames[vectorSize],
29 "( __global float",
30 sizeNames[vectorSize],
31 "* out, __global float",
32 sizeNames[vectorSize],
33 "* in1, __global float",
34 sizeNames[vectorSize],
35 "* in2 )\n"
36 "{\n"
37 " size_t i = get_global_id(0);\n"
38 " out[i] = in1[i] ",
39 operator_symbol,
40 " in2[i];\n"
41 "}\n" };
42
43 const char *c3[] = {
44 "__kernel void math_kernel",
45 sizeNames[vectorSize],
46 "( __global float* out, __global float* in, __global float* in2)\n"
47 "{\n"
48 " size_t i = get_global_id(0);\n"
49 " if( i + 1 < get_global_size(0) )\n"
50 " {\n"
51 " float3 f0 = vload3( 0, in + 3 * i );\n"
52 " float3 f1 = vload3( 0, in2 + 3 * i );\n"
53 " f0 = f0 ",
54 operator_symbol,
55 " f1;\n"
56 " vstore3( f0, 0, out + 3*i );\n"
57 " }\n"
58 " else\n"
59 " {\n"
60 " size_t parity = i & 1; // Figure out how many elements are "
61 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
62 "buffer size \n"
63 " float3 f0;\n"
64 " float3 f1;\n"
65 " switch( parity )\n"
66 " {\n"
67 " case 1:\n"
68 " f0 = (float3)( in[3*i], NAN, NAN ); \n"
69 " f1 = (float3)( in2[3*i], NAN, NAN ); \n"
70 " break;\n"
71 " case 0:\n"
72 " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
73 " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
74 " break;\n"
75 " }\n"
76 " f0 = f0 ",
77 operator_symbol,
78 " f1;\n"
79 " switch( parity )\n"
80 " {\n"
81 " case 0:\n"
82 " out[3*i+1] = f0.y; \n"
83 " // fall through\n"
84 " case 1:\n"
85 " out[3*i] = f0.x; \n"
86 " break;\n"
87 " }\n"
88 " }\n"
89 "}\n"
90 };
91
92 const char **kern = c;
93 size_t kernSize = sizeof(c) / sizeof(c[0]);
94
95 if (sizeValues[vectorSize] == 3)
96 {
97 kern = c3;
98 kernSize = sizeof(c3) / sizeof(c3[0]);
99 }
100
101 char testName[32];
102 snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
103 sizeNames[vectorSize]);
104
105 return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
106 relaxedMode);
107 }
108
109 typedef struct BuildKernelInfo
110 {
111 cl_uint offset; // the first vector size to build
112 cl_uint kernel_count;
113 cl_kernel **kernels;
114 cl_program *programs;
115 const char *operator_symbol;
116 bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
117 } BuildKernelInfo;
118
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)119 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
120 {
121 BuildKernelInfo *info = (BuildKernelInfo *)p;
122 cl_uint i = info->offset + job_id;
123 return BuildKernel(info->operator_symbol, i, info->kernel_count,
124 info->kernels[i], info->programs + i, info->relaxedMode);
125 }
126
127 // Thread specific data for a worker thread
128 typedef struct ThreadInfo
129 {
130 cl_mem inBuf; // input buffer for the thread
131 cl_mem inBuf2; // input buffer for the thread
132 cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
133 float maxError; // max error value. Init to 0.
134 double
135 maxErrorValue; // position of the max error value (param 1). Init to 0.
136 double maxErrorValue2; // position of the max error value (param 2). Init
137 // to 0.
138 MTdata d;
139 cl_command_queue tQueue; // per thread command queue to improve performance
140 } ThreadInfo;
141
142 typedef struct TestInfo
143 {
144 size_t subBufferSize; // Size of the sub-buffer in elements
145 const Func *f; // A pointer to the function info
146 cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
147 cl_kernel
148 *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
149 // worker thread: k[vector_size][thread_id]
150 ThreadInfo *
151 tinfo; // An array of thread specific information for each worker thread
152 cl_uint threadCount; // Number of worker threads
153 cl_uint jobCount; // Number of jobs
154 cl_uint step; // step between each chunk and the next.
155 cl_uint scale; // stride between individual test values
156 float ulps; // max_allowed ulps
157 int ftz; // non-zero if running in flush to zero mode
158 bool relaxedMode; // True if the test is being run in relaxed mode, false
159 // otherwise.
160
161 // no special fields
162 } TestInfo;
163
164 // A table of more difficult cases to get right
165 static const float specialValues[] = {
166 -NAN,
167 -INFINITY,
168 -FLT_MAX,
169 MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
170 MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
171 MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
172 MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
173 MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
174 MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
175 MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
176 MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
177 MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
178 MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
179 MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
180 MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
181 -1000.f,
182 -100.f,
183 -4.0f,
184 -3.5f,
185 -3.0f,
186 MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
187 -2.5f,
188 MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
189 -2.0f,
190 MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
191 -1.5f,
192 MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
193 MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
194 -1.0f,
195 MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
196 MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
197 -0.5f,
198 MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
199 MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
200 -0.25f,
201 MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
202 MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
203 -FLT_MIN,
204 MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
205 MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
206 MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
207 MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
208 MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
209 MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
210 MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
211 MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
212 MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
213 MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
214 -0.0f,
215
216 +NAN,
217 +INFINITY,
218 +FLT_MAX,
219 MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
220 MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
221 MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
222 MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
223 MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
224 MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
225 MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
226 MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
227 MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
228 MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
229 MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
230 MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
231 +1000.f,
232 +100.f,
233 +4.0f,
234 +3.5f,
235 +3.0f,
236 MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
237 2.5f,
238 MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
239 +2.0f,
240 MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
241 1.5f,
242 MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
243 MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
244 +1.0f,
245 MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
246 MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
247 +0.5f,
248 MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
249 MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
250 +0.25f,
251 MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
252 MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
253 +FLT_MIN,
254 MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
255 MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
256 MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
257 MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
258 MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
259 MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
260 MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
261 MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
262 MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
263 MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
264 +0.0f,
265 };
266
267 static const size_t specialValuesCount =
268 sizeof(specialValues) / sizeof(specialValues[0]);
269
270 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
271
TestFunc_Float_Float_Float_Operator(const Func * f,MTdata d,bool relaxedMode)272 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
273 bool relaxedMode)
274 {
275 TestInfo test_info;
276 cl_int error;
277 float maxError = 0.0f;
278 double maxErrorVal = 0.0;
279 double maxErrorVal2 = 0.0;
280
281 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
282
283 // Init test_info
284 memset(&test_info, 0, sizeof(test_info));
285 test_info.threadCount = GetThreadCount();
286 test_info.subBufferSize = BUFFER_SIZE
287 / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
288 test_info.scale = getTestScale(sizeof(cl_float));
289
290 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
291 if (test_info.step / test_info.subBufferSize != test_info.scale)
292 {
293 // there was overflow
294 test_info.jobCount = 1;
295 }
296 else
297 {
298 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
299 }
300
301 test_info.f = f;
302 test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
303 test_info.ftz =
304 f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
305 test_info.relaxedMode = relaxedMode;
306
307 // cl_kernels aren't thread safe, so we make one for each vector size for
308 // every thread
309 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
310 {
311 size_t array_size = test_info.threadCount * sizeof(cl_kernel);
312 test_info.k[i] = (cl_kernel *)malloc(array_size);
313 if (NULL == test_info.k[i])
314 {
315 vlog_error("Error: Unable to allocate storage for kernels!\n");
316 error = CL_OUT_OF_HOST_MEMORY;
317 goto exit;
318 }
319 memset(test_info.k[i], 0, array_size);
320 }
321 test_info.tinfo =
322 (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
323 if (NULL == test_info.tinfo)
324 {
325 vlog_error(
326 "Error: Unable to allocate storage for thread specific data.\n");
327 error = CL_OUT_OF_HOST_MEMORY;
328 goto exit;
329 }
330 memset(test_info.tinfo, 0,
331 test_info.threadCount * sizeof(*test_info.tinfo));
332 for (cl_uint i = 0; i < test_info.threadCount; i++)
333 {
334 cl_buffer_region region = {
335 i * test_info.subBufferSize * sizeof(cl_float),
336 test_info.subBufferSize * sizeof(cl_float)
337 };
338 test_info.tinfo[i].inBuf =
339 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
340 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
341 if (error || NULL == test_info.tinfo[i].inBuf)
342 {
343 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
344 "region {%zd, %zd}\n",
345 region.origin, region.size);
346 goto exit;
347 }
348 test_info.tinfo[i].inBuf2 =
349 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
350 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
351 if (error || NULL == test_info.tinfo[i].inBuf2)
352 {
353 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
354 "region {%zd, %zd}\n",
355 region.origin, region.size);
356 goto exit;
357 }
358
359 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
360 {
361 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
362 gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
363 ®ion, &error);
364 if (error || NULL == test_info.tinfo[i].outBuf[j])
365 {
366 vlog_error("Error: Unable to create sub-buffer of "
367 "gOutBuffer[%d] for region {%zd, %zd}\n",
368 (int)j, region.origin, region.size);
369 goto exit;
370 }
371 }
372 test_info.tinfo[i].tQueue =
373 clCreateCommandQueue(gContext, gDevice, 0, &error);
374 if (NULL == test_info.tinfo[i].tQueue || error)
375 {
376 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
377 goto exit;
378 }
379
380 test_info.tinfo[i].d = init_genrand(genrand_int32(d));
381 }
382
383 // Init the kernels
384 {
385 BuildKernelInfo build_info = {
386 gMinVectorSizeIndex, test_info.threadCount, test_info.k,
387 test_info.programs, f->nameInCode, relaxedMode
388 };
389 if ((error = ThreadPool_Do(BuildKernelFn,
390 gMaxVectorSizeIndex - gMinVectorSizeIndex,
391 &build_info)))
392 goto exit;
393 }
394
395 // Run the kernels
396 if (!gSkipCorrectnessTesting)
397 {
398 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
399
400 // Accumulate the arithmetic errors
401 for (cl_uint i = 0; i < test_info.threadCount; i++)
402 {
403 if (test_info.tinfo[i].maxError > maxError)
404 {
405 maxError = test_info.tinfo[i].maxError;
406 maxErrorVal = test_info.tinfo[i].maxErrorValue;
407 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
408 }
409 }
410
411 if (error) goto exit;
412
413 if (gWimpyMode)
414 vlog("Wimp pass");
415 else
416 vlog("passed");
417
418 vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
419 }
420
421 vlog("\n");
422
423 exit:
424 // Release
425 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
426 {
427 clReleaseProgram(test_info.programs[i]);
428 if (test_info.k[i])
429 {
430 for (cl_uint j = 0; j < test_info.threadCount; j++)
431 clReleaseKernel(test_info.k[i][j]);
432
433 free(test_info.k[i]);
434 }
435 }
436 if (test_info.tinfo)
437 {
438 for (cl_uint i = 0; i < test_info.threadCount; i++)
439 {
440 free_mtdata(test_info.tinfo[i].d);
441 clReleaseMemObject(test_info.tinfo[i].inBuf);
442 clReleaseMemObject(test_info.tinfo[i].inBuf2);
443 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
444 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
445 clReleaseCommandQueue(test_info.tinfo[i].tQueue);
446 }
447
448 free(test_info.tinfo);
449 }
450
451 return error;
452 }
453
Test(cl_uint job_id,cl_uint thread_id,void * data)454 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
455 {
456 const TestInfo *job = (const TestInfo *)data;
457 size_t buffer_elements = job->subBufferSize;
458 size_t buffer_size = buffer_elements * sizeof(cl_float);
459 cl_uint base = job_id * (cl_uint)job->step;
460 ThreadInfo *tinfo = job->tinfo + thread_id;
461 fptr func = job->f->func;
462 int ftz = job->ftz;
463 bool relaxedMode = job->relaxedMode;
464 float ulps = getAllowedUlpError(job->f, relaxedMode);
465 MTdata d = tinfo->d;
466 cl_int error;
467 cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
468 const char *name = job->f->name;
469 cl_uint *t = 0;
470 cl_float *r = 0;
471 cl_float *s = 0;
472 cl_float *s2 = 0;
473 RoundingMode oldRoundMode;
474
475 if (relaxedMode)
476 {
477 func = job->f->rfunc;
478 }
479
480 // start the map of the output arrays
481 cl_event e[VECTOR_SIZE_COUNT];
482 cl_uint *out[VECTOR_SIZE_COUNT];
483 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
484 {
485 out[j] = (cl_uint *)clEnqueueMapBuffer(
486 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
487 buffer_size, 0, NULL, e + j, &error);
488 if (error || NULL == out[j])
489 {
490 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
491 error);
492 return error;
493 }
494 }
495
496 // Get that moving
497 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
498
499 // Init input array
500 cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
501 cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
502 cl_uint idx = 0;
503 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
504 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
505
506 if (job_id <= (cl_uint)lastSpecialJobIndex)
507 {
508 // Insert special values
509 uint32_t x, y;
510
511 x = (job_id * buffer_elements) % specialValuesCount;
512 y = (job_id * buffer_elements) / specialValuesCount;
513
514 for (; idx < buffer_elements; idx++)
515 {
516 p[idx] = ((cl_uint *)specialValues)[x];
517 p2[idx] = ((cl_uint *)specialValues)[y];
518 ++x;
519 if (x >= specialValuesCount)
520 {
521 x = 0;
522 y++;
523 if (y >= specialValuesCount) break;
524 }
525 if (relaxedMode && strcmp(name, "divide") == 0)
526 {
527 cl_uint pj = p[idx] & 0x7fffffff;
528 cl_uint p2j = p2[idx] & 0x7fffffff;
529 // Replace values outside [2^-62, 2^62] with QNaN
530 if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
531 if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
532 }
533 }
534 }
535
536 // Init any remaining values.
537 for (; idx < buffer_elements; idx++)
538 {
539 p[idx] = genrand_int32(d);
540 p2[idx] = genrand_int32(d);
541
542 if (relaxedMode && strcmp(name, "divide") == 0)
543 {
544 cl_uint pj = p[idx] & 0x7fffffff;
545 cl_uint p2j = p2[idx] & 0x7fffffff;
546 // Replace values outside [2^-62, 2^62] with QNaN
547 if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
548 if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
549 }
550 }
551
552 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
553 buffer_size, p, 0, NULL, NULL)))
554 {
555 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
556 goto exit;
557 }
558
559 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
560 buffer_size, p2, 0, NULL, NULL)))
561 {
562 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
563 goto exit;
564 }
565
566 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
567 {
568 // Wait for the map to finish
569 if ((error = clWaitForEvents(1, e + j)))
570 {
571 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
572 goto exit;
573 }
574 if ((error = clReleaseEvent(e[j])))
575 {
576 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
577 goto exit;
578 }
579
580 // Fill the result buffer with garbage, so that old results don't carry
581 // over
582 uint32_t pattern = 0xffffdead;
583 memset_pattern4(out[j], &pattern, buffer_size);
584 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
585 out[j], 0, NULL, NULL)))
586 {
587 vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
588 goto exit;
589 }
590
591 // run the kernel
592 size_t vectorCount =
593 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
594 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
595 // own copy of the cl_kernel
596 cl_program program = job->programs[j];
597
598 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
599 &tinfo->outBuf[j])))
600 {
601 LogBuildError(program);
602 return error;
603 }
604 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
605 &tinfo->inBuf)))
606 {
607 LogBuildError(program);
608 return error;
609 }
610 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
611 &tinfo->inBuf2)))
612 {
613 LogBuildError(program);
614 return error;
615 }
616
617 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
618 &vectorCount, NULL, 0, NULL, NULL)))
619 {
620 vlog_error("FAILED -- could not execute kernel\n");
621 goto exit;
622 }
623 }
624
625 // Get that moving
626 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
627
628 if (gSkipCorrectnessTesting)
629 {
630 free(overflow);
631 return CL_SUCCESS;
632 }
633
634 // Calculate the correctly rounded reference result
635 FPU_mode_type oldMode;
636 memset(&oldMode, 0, sizeof(oldMode));
637 if (ftz) ForceFTZ(&oldMode);
638
639 // Set the rounding mode to match the device
640 oldRoundMode = kRoundToNearestEven;
641 if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
642
643 // Calculate the correctly rounded reference result
644 r = (float *)gOut_Ref + thread_id * buffer_elements;
645 s = (float *)gIn + thread_id * buffer_elements;
646 s2 = (float *)gIn2 + thread_id * buffer_elements;
647 if (gInfNanSupport)
648 {
649 for (size_t j = 0; j < buffer_elements; j++)
650 r[j] = (float)func.f_ff(s[j], s2[j]);
651 }
652 else
653 {
654 for (size_t j = 0; j < buffer_elements; j++)
655 {
656 feclearexcept(FE_OVERFLOW);
657 r[j] = (float)func.f_ff(s[j], s2[j]);
658 overflow[j] =
659 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
660 }
661 }
662
663 if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
664
665 if (ftz) RestoreFPState(&oldMode);
666
667 // Read the data back -- no need to wait for the first N-1 buffers but wait
668 // for the last buffer. This is an in order queue.
669 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
670 {
671 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
672 out[j] = (cl_uint *)clEnqueueMapBuffer(
673 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
674 buffer_size, 0, NULL, NULL, &error);
675 if (error || NULL == out[j])
676 {
677 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
678 error);
679 goto exit;
680 }
681 }
682
683 // Verify data
684 t = (cl_uint *)r;
685 for (size_t j = 0; j < buffer_elements; j++)
686 {
687 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
688 {
689 cl_uint *q = out[k];
690
691 // If we aren't getting the correctly rounded result
692 if (t[j] != q[j])
693 {
694 float test = ((float *)q)[j];
695 double correct = func.f_ff(s[j], s2[j]);
696
697 // Per section 10 paragraph 6, accept any result if an input or
698 // output is a infinity or NaN or overflow
699 if (!gInfNanSupport)
700 {
701 // Note: no double rounding here. Reference functions
702 // calculate in single precision.
703 if (overflow[j] || IsFloatInfinity(correct)
704 || IsFloatNaN(correct) || IsFloatInfinity(s2[j])
705 || IsFloatNaN(s2[j]) || IsFloatInfinity(s[j])
706 || IsFloatNaN(s[j]))
707 continue;
708 }
709
710 // Per section 10 paragraph 6, accept embedded devices always
711 // returning positive 0.0.
712 if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000))
713 continue;
714
715 float err = Ulp_Error(test, correct);
716 float errB = Ulp_Error(test, (float)correct);
717
718 int fail =
719 ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
720 if (fabsf(errB) < fabsf(err)) err = errB;
721
722 if (fail && ftz)
723 {
724 // retry per section 6.5.3.2
725 if (IsFloatResultSubnormal(correct, ulps))
726 {
727 fail = fail && (test != 0.0f);
728 if (!fail) err = 0.0f;
729 }
730
731 // retry per section 6.5.3.3
732 if (IsFloatSubnormal(s[j]))
733 {
734 double correct2, correct3;
735 float err2, err3;
736
737 if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
738
739 correct2 = func.f_ff(0.0, s2[j]);
740 correct3 = func.f_ff(-0.0, s2[j]);
741
742 // Per section 10 paragraph 6, accept any result if an
743 // input or output is a infinity or NaN or overflow
744 if (!gInfNanSupport)
745 {
746 if (fetestexcept(FE_OVERFLOW)) continue;
747
748 // Note: no double rounding here. Reference
749 // functions calculate in single precision.
750 if (IsFloatInfinity(correct2)
751 || IsFloatNaN(correct2)
752 || IsFloatInfinity(correct3)
753 || IsFloatNaN(correct3))
754 continue;
755 }
756
757 err2 = Ulp_Error(test, correct2);
758 err3 = Ulp_Error(test, correct3);
759 fail = fail
760 && ((!(fabsf(err2) <= ulps))
761 && (!(fabsf(err3) <= ulps)));
762 if (fabsf(err2) < fabsf(err)) err = err2;
763 if (fabsf(err3) < fabsf(err)) err = err3;
764
765 // retry per section 6.5.3.4
766 if (IsFloatResultSubnormal(correct2, ulps)
767 || IsFloatResultSubnormal(correct3, ulps))
768 {
769 fail = fail && (test != 0.0f);
770 if (!fail) err = 0.0f;
771 }
772
773 // try with both args as zero
774 if (IsFloatSubnormal(s2[j]))
775 {
776 double correct4, correct5;
777 float err4, err5;
778
779 if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
780
781 correct2 = func.f_ff(0.0, 0.0);
782 correct3 = func.f_ff(-0.0, 0.0);
783 correct4 = func.f_ff(0.0, -0.0);
784 correct5 = func.f_ff(-0.0, -0.0);
785
786 // Per section 10 paragraph 6, accept any result if
787 // an input or output is a infinity or NaN or
788 // overflow
789 if (!gInfNanSupport)
790 {
791 if (fetestexcept(FE_OVERFLOW)) continue;
792
793 // Note: no double rounding here. Reference
794 // functions calculate in single precision.
795 if (IsFloatInfinity(correct2)
796 || IsFloatNaN(correct2)
797 || IsFloatInfinity(correct3)
798 || IsFloatNaN(correct3)
799 || IsFloatInfinity(correct4)
800 || IsFloatNaN(correct4)
801 || IsFloatInfinity(correct5)
802 || IsFloatNaN(correct5))
803 continue;
804 }
805
806 err2 = Ulp_Error(test, correct2);
807 err3 = Ulp_Error(test, correct3);
808 err4 = Ulp_Error(test, correct4);
809 err5 = Ulp_Error(test, correct5);
810 fail = fail
811 && ((!(fabsf(err2) <= ulps))
812 && (!(fabsf(err3) <= ulps))
813 && (!(fabsf(err4) <= ulps))
814 && (!(fabsf(err5) <= ulps)));
815 if (fabsf(err2) < fabsf(err)) err = err2;
816 if (fabsf(err3) < fabsf(err)) err = err3;
817 if (fabsf(err4) < fabsf(err)) err = err4;
818 if (fabsf(err5) < fabsf(err)) err = err5;
819
820 // retry per section 6.5.3.4
821 if (IsFloatResultSubnormal(correct2, ulps)
822 || IsFloatResultSubnormal(correct3, ulps)
823 || IsFloatResultSubnormal(correct4, ulps)
824 || IsFloatResultSubnormal(correct5, ulps))
825 {
826 fail = fail && (test != 0.0f);
827 if (!fail) err = 0.0f;
828 }
829 }
830 }
831 else if (IsFloatSubnormal(s2[j]))
832 {
833 double correct2, correct3;
834 float err2, err3;
835
836 if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
837
838 correct2 = func.f_ff(s[j], 0.0);
839 correct3 = func.f_ff(s[j], -0.0);
840
841 // Per section 10 paragraph 6, accept any result if an
842 // input or output is a infinity or NaN or overflow
843 if (!gInfNanSupport)
844 {
845 // Note: no double rounding here. Reference
846 // functions calculate in single precision.
847 if (overflow[j] || IsFloatInfinity(correct)
848 || IsFloatNaN(correct)
849 || IsFloatInfinity(correct2)
850 || IsFloatNaN(correct2))
851 continue;
852 }
853
854 err2 = Ulp_Error(test, correct2);
855 err3 = Ulp_Error(test, correct3);
856 fail = fail
857 && ((!(fabsf(err2) <= ulps))
858 && (!(fabsf(err3) <= ulps)));
859 if (fabsf(err2) < fabsf(err)) err = err2;
860 if (fabsf(err3) < fabsf(err)) err = err3;
861
862 // retry per section 6.5.3.4
863 if (IsFloatResultSubnormal(correct2, ulps)
864 || IsFloatResultSubnormal(correct3, ulps))
865 {
866 fail = fail && (test != 0.0f);
867 if (!fail) err = 0.0f;
868 }
869 }
870 }
871
872
873 if (fabsf(err) > tinfo->maxError)
874 {
875 tinfo->maxError = fabsf(err);
876 tinfo->maxErrorValue = s[j];
877 tinfo->maxErrorValue2 = s2[j];
878 }
879 if (fail)
880 {
881 vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a "
882 "vs. %a (0x%8.8x) at index: %d\n",
883 name, sizeNames[k], err, s[j], s2[j], r[j], test,
884 ((cl_uint *)&test)[0], j);
885 error = -1;
886 goto exit;
887 }
888 }
889 }
890 }
891
892 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
893 {
894 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
895 out[j], 0, NULL, NULL)))
896 {
897 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
898 j, error);
899 return error;
900 }
901 }
902
903 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
904
905
906 if (0 == (base & 0x0fffffff))
907 {
908 if (gVerboseBruteForce)
909 {
910 vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
911 "ThreadCount:%2u\n",
912 base, job->step, job->scale, buffer_elements, job->ulps,
913 job->threadCount);
914 }
915 else
916 {
917 vlog(".");
918 }
919 fflush(stdout);
920 }
921
922 exit:
923 if (overflow) free(overflow);
924 return error;
925 }
926