1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "function_list.h"
18 #include "test_functions.h"
19 #include "utility.h"
20
21 #include <cstring>
22
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)23 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
24 cl_kernel *k, cl_program *p, bool relaxedMode)
25 {
26 const char *c[] = { "__kernel void math_kernel",
27 sizeNames[vectorSize],
28 "( __global int",
29 sizeNames[vectorSize],
30 "* out, __global float",
31 sizeNames[vectorSize],
32 "* in1, __global float",
33 sizeNames[vectorSize],
34 "* in2 )\n"
35 "{\n"
36 " size_t i = get_global_id(0);\n"
37 " out[i] = ",
38 name,
39 "( in1[i], in2[i] );\n"
40 "}\n" };
41
42 const char *c3[] = {
43 "__kernel void math_kernel",
44 sizeNames[vectorSize],
45 "( __global int* out, __global float* in, __global float* in2)\n"
46 "{\n"
47 " size_t i = get_global_id(0);\n"
48 " if( i + 1 < get_global_size(0) )\n"
49 " {\n"
50 " float3 f0 = vload3( 0, in + 3 * i );\n"
51 " float3 f1 = vload3( 0, in2 + 3 * i );\n"
52 " int3 i0 = ",
53 name,
54 "( f0, f1 );\n"
55 " vstore3( i0, 0, out + 3*i );\n"
56 " }\n"
57 " else\n"
58 " {\n"
59 " size_t parity = i & 1; // Figure out how many elements are "
60 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
61 "buffer size \n"
62 " float3 f0;\n"
63 " float3 f1;\n"
64 " switch( parity )\n"
65 " {\n"
66 " case 1:\n"
67 " f0 = (float3)( in[3*i], NAN, NAN ); \n"
68 " f1 = (float3)( in2[3*i], NAN, NAN ); \n"
69 " break;\n"
70 " case 0:\n"
71 " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
72 " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
73 " break;\n"
74 " }\n"
75 " int3 i0 = ",
76 name,
77 "( f0, f1 );\n"
78 " switch( parity )\n"
79 " {\n"
80 " case 0:\n"
81 " out[3*i+1] = i0.y; \n"
82 " // fall through\n"
83 " case 1:\n"
84 " out[3*i] = i0.x; \n"
85 " break;\n"
86 " }\n"
87 " }\n"
88 "}\n"
89 };
90
91 const char **kern = c;
92 size_t kernSize = sizeof(c) / sizeof(c[0]);
93
94 if (sizeValues[vectorSize] == 3)
95 {
96 kern = c3;
97 kernSize = sizeof(c3) / sizeof(c3[0]);
98 }
99
100 char testName[32];
101 snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
102 sizeNames[vectorSize]);
103
104 return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
105 relaxedMode);
106 }
107
108 typedef struct BuildKernelInfo
109 {
110 cl_uint offset; // the first vector size to build
111 cl_uint kernel_count;
112 cl_kernel **kernels;
113 cl_program *programs;
114 const char *nameInCode;
115 bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
116 } BuildKernelInfo;
117
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)118 static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
119 {
120 BuildKernelInfo *info = (BuildKernelInfo *)p;
121 cl_uint i = info->offset + job_id;
122 return BuildKernel(info->nameInCode, i, info->kernel_count,
123 info->kernels[i], info->programs + i, info->relaxedMode);
124 }
125
126 // Thread specific data for a worker thread
127 typedef struct ThreadInfo
128 {
129 cl_mem inBuf; // input buffer for the thread
130 cl_mem inBuf2; // input buffer for the thread
131 cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
132 MTdata d;
133 cl_command_queue tQueue; // per thread command queue to improve performance
134 } ThreadInfo;
135
136 typedef struct TestInfo
137 {
138 size_t subBufferSize; // Size of the sub-buffer in elements
139 const Func *f; // A pointer to the function info
140 cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
141 cl_kernel
142 *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
143 // worker thread: k[vector_size][thread_id]
144 ThreadInfo *
145 tinfo; // An array of thread specific information for each worker thread
146 cl_uint threadCount; // Number of worker threads
147 cl_uint jobCount; // Number of jobs
148 cl_uint step; // step between each chunk and the next.
149 cl_uint scale; // stride between individual test values
150 int ftz; // non-zero if running in flush to zero mode
151
152 } TestInfo;
153
154 // A table of more difficult cases to get right
155 static const float specialValues[] = {
156 -NAN,
157 -INFINITY,
158 -FLT_MAX,
159 MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
160 MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
161 MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
162 MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
163 MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
164 MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
165 MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
166 MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
167 MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
168 MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
169 MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
170 MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
171 -1000.f,
172 -100.f,
173 -4.0f,
174 -3.5f,
175 -3.0f,
176 MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
177 -2.5f,
178 MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
179 -2.0f,
180 MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
181 -1.5f,
182 MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
183 MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
184 -1.0f,
185 MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
186 MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
187 -0.5f,
188 MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
189 MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
190 -0.25f,
191 MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
192 MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
193 -FLT_MIN,
194 MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
195 MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
196 MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
197 MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
198 MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
199 MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
200 MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
201 MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
202 MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
203 MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
204 -0.0f,
205
206 +NAN,
207 +INFINITY,
208 +FLT_MAX,
209 MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
210 MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
211 MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
212 MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
213 MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
214 MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
215 MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
216 MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
217 MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
218 MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
219 MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
220 MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
221 +1000.f,
222 +100.f,
223 +4.0f,
224 +3.5f,
225 +3.0f,
226 MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
227 2.5f,
228 MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
229 +2.0f,
230 MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
231 1.5f,
232 MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
233 MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
234 +1.0f,
235 MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
236 MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
237 +0.5f,
238 MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
239 MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
240 +0.25f,
241 MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
242 MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
243 +FLT_MIN,
244 MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
245 MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
246 MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
247 MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
248 MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
249 MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
250 MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
251 MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
252 MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
253 MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
254 +0.0f,
255 };
256
257 static const size_t specialValuesCount =
258 sizeof(specialValues) / sizeof(specialValues[0]);
259
260 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
261
TestMacro_Int_Float_Float(const Func * f,MTdata d,bool relaxedMode)262 int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
263 {
264 TestInfo test_info;
265 cl_int error;
266
267 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
268
269 // Init test_info
270 memset(&test_info, 0, sizeof(test_info));
271 test_info.threadCount = GetThreadCount();
272 test_info.subBufferSize = BUFFER_SIZE
273 / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
274 test_info.scale = getTestScale(sizeof(cl_float));
275
276 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
277 if (test_info.step / test_info.subBufferSize != test_info.scale)
278 {
279 // there was overflow
280 test_info.jobCount = 1;
281 }
282 else
283 {
284 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
285 }
286
287 test_info.f = f;
288 test_info.ftz =
289 f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
290
291 // cl_kernels aren't thread safe, so we make one for each vector size for
292 // every thread
293 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
294 {
295 size_t array_size = test_info.threadCount * sizeof(cl_kernel);
296 test_info.k[i] = (cl_kernel *)malloc(array_size);
297 if (NULL == test_info.k[i])
298 {
299 vlog_error("Error: Unable to allocate storage for kernels!\n");
300 error = CL_OUT_OF_HOST_MEMORY;
301 goto exit;
302 }
303 memset(test_info.k[i], 0, array_size);
304 }
305 test_info.tinfo =
306 (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
307 if (NULL == test_info.tinfo)
308 {
309 vlog_error(
310 "Error: Unable to allocate storage for thread specific data.\n");
311 error = CL_OUT_OF_HOST_MEMORY;
312 goto exit;
313 }
314 memset(test_info.tinfo, 0,
315 test_info.threadCount * sizeof(*test_info.tinfo));
316 for (cl_uint i = 0; i < test_info.threadCount; i++)
317 {
318 cl_buffer_region region = {
319 i * test_info.subBufferSize * sizeof(cl_float),
320 test_info.subBufferSize * sizeof(cl_float)
321 };
322 test_info.tinfo[i].inBuf =
323 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
324 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
325 if (error || NULL == test_info.tinfo[i].inBuf)
326 {
327 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
328 "region {%zd, %zd}\n",
329 region.origin, region.size);
330 goto exit;
331 }
332 test_info.tinfo[i].inBuf2 =
333 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
334 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
335 if (error || NULL == test_info.tinfo[i].inBuf2)
336 {
337 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
338 "region {%zd, %zd}\n",
339 region.origin, region.size);
340 goto exit;
341 }
342
343 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
344 {
345 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
346 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
347 ®ion, &error);
348 if (error || NULL == test_info.tinfo[i].outBuf[j])
349 {
350 vlog_error("Error: Unable to create sub-buffer of "
351 "gOutBuffer[%d] for region {%zd, %zd}\n",
352 (int)j, region.origin, region.size);
353 goto exit;
354 }
355 }
356 test_info.tinfo[i].tQueue =
357 clCreateCommandQueue(gContext, gDevice, 0, &error);
358 if (NULL == test_info.tinfo[i].tQueue || error)
359 {
360 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
361 goto exit;
362 }
363
364 test_info.tinfo[i].d = init_genrand(genrand_int32(d));
365 }
366
367 // Init the kernels
368 {
369 BuildKernelInfo build_info = {
370 gMinVectorSizeIndex, test_info.threadCount, test_info.k,
371 test_info.programs, f->nameInCode, relaxedMode
372 };
373 if ((error = ThreadPool_Do(BuildKernelFn,
374 gMaxVectorSizeIndex - gMinVectorSizeIndex,
375 &build_info)))
376 goto exit;
377 }
378
379 // Run the kernels
380 if (!gSkipCorrectnessTesting)
381 {
382 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
383
384 if (error) goto exit;
385
386 if (gWimpyMode)
387 vlog("Wimp pass");
388 else
389 vlog("passed");
390 }
391
392 vlog("\n");
393
394 exit:
395 // Release
396 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
397 {
398 clReleaseProgram(test_info.programs[i]);
399 if (test_info.k[i])
400 {
401 for (cl_uint j = 0; j < test_info.threadCount; j++)
402 clReleaseKernel(test_info.k[i][j]);
403
404 free(test_info.k[i]);
405 }
406 }
407 if (test_info.tinfo)
408 {
409 for (cl_uint i = 0; i < test_info.threadCount; i++)
410 {
411 free_mtdata(test_info.tinfo[i].d);
412 clReleaseMemObject(test_info.tinfo[i].inBuf);
413 clReleaseMemObject(test_info.tinfo[i].inBuf2);
414 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
415 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
416 clReleaseCommandQueue(test_info.tinfo[i].tQueue);
417 }
418
419 free(test_info.tinfo);
420 }
421
422 return error;
423 }
424
Test(cl_uint job_id,cl_uint thread_id,void * data)425 static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
426 {
427 const TestInfo *job = (const TestInfo *)data;
428 size_t buffer_elements = job->subBufferSize;
429 size_t buffer_size = buffer_elements * sizeof(cl_float);
430 cl_uint base = job_id * (cl_uint)job->step;
431 ThreadInfo *tinfo = job->tinfo + thread_id;
432 fptr func = job->f->func;
433 int ftz = job->ftz;
434 MTdata d = tinfo->d;
435 cl_int error;
436 const char *name = job->f->name;
437 cl_int *t = 0;
438 cl_int *r = 0;
439 cl_float *s = 0;
440 cl_float *s2 = 0;
441
442 // start the map of the output arrays
443 cl_event e[VECTOR_SIZE_COUNT];
444 cl_int *out[VECTOR_SIZE_COUNT];
445 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
446 {
447 out[j] = (cl_int *)clEnqueueMapBuffer(
448 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
449 buffer_size, 0, NULL, e + j, &error);
450 if (error || NULL == out[j])
451 {
452 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
453 error);
454 return error;
455 }
456 }
457
458 // Get that moving
459 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
460
461 // Init input array
462 cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
463 cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
464 cl_uint idx = 0;
465
466 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
467 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
468
469 if (job_id <= (cl_uint)lastSpecialJobIndex)
470 { // test edge cases
471 float *fp = (float *)p;
472 float *fp2 = (float *)p2;
473 uint32_t x, y;
474
475 x = (job_id * buffer_elements) % specialValuesCount;
476 y = (job_id * buffer_elements) / specialValuesCount;
477
478 for (; idx < buffer_elements; idx++)
479 {
480 fp[idx] = specialValues[x];
481 fp2[idx] = specialValues[y];
482 ++x;
483 if (x >= specialValuesCount)
484 {
485 x = 0;
486 y++;
487 if (y >= specialValuesCount) break;
488 }
489 }
490 }
491
492 // Init any remaining values.
493 for (; idx < buffer_elements; idx++)
494 {
495 p[idx] = genrand_int32(d);
496 p2[idx] = genrand_int32(d);
497 }
498
499 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
500 buffer_size, p, 0, NULL, NULL)))
501 {
502 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
503 goto exit;
504 }
505
506 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
507 buffer_size, p2, 0, NULL, NULL)))
508 {
509 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
510 goto exit;
511 }
512
513 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
514 {
515 // Wait for the map to finish
516 if ((error = clWaitForEvents(1, e + j)))
517 {
518 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
519 goto exit;
520 }
521 if ((error = clReleaseEvent(e[j])))
522 {
523 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
524 goto exit;
525 }
526
527 // Fill the result buffer with garbage, so that old results don't carry
528 // over
529 uint32_t pattern = 0xffffdead;
530 memset_pattern4(out[j], &pattern, buffer_size);
531 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
532 out[j], 0, NULL, NULL)))
533 {
534 vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
535 goto exit;
536 }
537
538 // run the kernel
539 size_t vectorCount =
540 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
541 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
542 // own copy of the cl_kernel
543 cl_program program = job->programs[j];
544
545 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
546 &tinfo->outBuf[j])))
547 {
548 LogBuildError(program);
549 return error;
550 }
551 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
552 &tinfo->inBuf)))
553 {
554 LogBuildError(program);
555 return error;
556 }
557 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
558 &tinfo->inBuf2)))
559 {
560 LogBuildError(program);
561 return error;
562 }
563
564 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
565 &vectorCount, NULL, 0, NULL, NULL)))
566 {
567 vlog_error("FAILED -- could not execute kernel\n");
568 goto exit;
569 }
570 }
571
572 // Get that moving
573 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
574
575 if (gSkipCorrectnessTesting) return CL_SUCCESS;
576
577 // Calculate the correctly rounded reference result
578 r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
579 s = (float *)gIn + thread_id * buffer_elements;
580 s2 = (float *)gIn2 + thread_id * buffer_elements;
581 for (size_t j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
582
583 // Read the data back -- no need to wait for the first N-1 buffers but wait
584 // for the last buffer. This is an in order queue.
585 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
586 {
587 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
588 out[j] = (cl_int *)clEnqueueMapBuffer(
589 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
590 buffer_size, 0, NULL, NULL, &error);
591 if (error || NULL == out[j])
592 {
593 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
594 error);
595 goto exit;
596 }
597 }
598
599 // Verify data
600 t = (cl_int *)r;
601 for (size_t j = 0; j < buffer_elements; j++)
602 {
603 cl_int *q = out[0];
604
605 if (gMinVectorSizeIndex == 0 && t[j] != q[j])
606 {
607 if (ftz)
608 {
609 if (IsFloatSubnormal(s[j]))
610 {
611 if (IsFloatSubnormal(s2[j]))
612 {
613 int correct = func.i_ff(0.0f, 0.0f);
614 int correct2 = func.i_ff(0.0f, -0.0f);
615 int correct3 = func.i_ff(-0.0f, 0.0f);
616 int correct4 = func.i_ff(-0.0f, -0.0f);
617
618 if (correct == q[j] || correct2 == q[j]
619 || correct3 == q[j] || correct4 == q[j])
620 continue;
621 }
622 else
623 {
624 int correct = func.i_ff(0.0f, s2[j]);
625 int correct2 = func.i_ff(-0.0f, s2[j]);
626 if (correct == q[j] || correct2 == q[j]) continue;
627 }
628 }
629 else if (IsFloatSubnormal(s2[j]))
630 {
631 int correct = func.i_ff(s[j], 0.0f);
632 int correct2 = func.i_ff(s[j], -0.0f);
633 if (correct == q[j] || correct2 == q[j]) continue;
634 }
635 }
636
637 uint32_t err = t[j] - q[j];
638 if (q[j] > t[j]) err = q[j] - t[j];
639 vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
640 "0x%8.8x (index: %d)\n",
641 name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
642 j);
643 error = -1;
644 goto exit;
645 }
646
647 for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
648 {
649 q = out[k];
650 // If we aren't getting the correctly rounded result
651 if (-t[j] != q[j])
652 {
653 if (ftz)
654 {
655 if (IsFloatSubnormal(s[j]))
656 {
657 if (IsFloatSubnormal(s2[j]))
658 {
659 int correct = -func.i_ff(0.0f, 0.0f);
660 int correct2 = -func.i_ff(0.0f, -0.0f);
661 int correct3 = -func.i_ff(-0.0f, 0.0f);
662 int correct4 = -func.i_ff(-0.0f, -0.0f);
663
664 if (correct == q[j] || correct2 == q[j]
665 || correct3 == q[j] || correct4 == q[j])
666 continue;
667 }
668 else
669 {
670 int correct = -func.i_ff(0.0f, s2[j]);
671 int correct2 = -func.i_ff(-0.0f, s2[j]);
672 if (correct == q[j] || correct2 == q[j]) continue;
673 }
674 }
675 else if (IsFloatSubnormal(s2[j]))
676 {
677 int correct = -func.i_ff(s[j], 0.0f);
678 int correct2 = -func.i_ff(s[j], -0.0f);
679 if (correct == q[j] || correct2 == q[j]) continue;
680 }
681 }
682 cl_uint err = -t[j] - q[j];
683 if (q[j] > -t[j]) err = q[j] + t[j];
684 vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
685 "vs. 0x%8.8x (index: %d)\n",
686 name, sizeNames[k], err, ((float *)s)[j],
687 ((float *)s2)[j], -t[j], q[j], j);
688 error = -1;
689 goto exit;
690 }
691 }
692 }
693
694 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
695 {
696 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
697 out[j], 0, NULL, NULL)))
698 {
699 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
700 j, error);
701 return error;
702 }
703 }
704
705 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
706
707
708 if (0 == (base & 0x0fffffff))
709 {
710 if (gVerboseBruteForce)
711 {
712 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
713 "ThreadCount:%2u\n",
714 base, job->step, job->scale, buffer_elements,
715 job->threadCount);
716 }
717 else
718 {
719 vlog(".");
720 }
721 fflush(stdout);
722 }
723
724 exit:
725 return error;
726 }
727