//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "function_list.h"
#include "test_functions.h"
#include "utility.h"

#include <cstring>

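// Generates the OpenCL C source for the math kernel at the given vector size
// and builds kernel_count copies of it (one per worker thread, since cl_kernel
// objects are not thread safe).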
static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                       cl_kernel *k, cl_program *p, bool relaxedMode)
{
    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                        "__kernel void math_kernel",
                        sizeNames[vectorSize],
                        "( __global double",
                        sizeNames[vectorSize],
                        "* out, __global double",
                        sizeNames[vectorSize],
                        "* in )\n"
                        "{\n"
                        "   size_t i = get_global_id(0);\n"
                        "   out[i] = ",
                        name,
                        "( in[i] );\n"
                        "}\n" };

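    // Variant for 3-element vectors: vload3/vstore3 touch three elements per
    // work-item, so the last work-item must special-case the one or two
    // elements left over at the end of the buffer.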
    const char *c3[] = {
        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel",
        sizeNames[vectorSize],
        "( __global double* out, __global double* in)\n"
        "{\n"
        "   size_t i = get_global_id(0);\n"
        "   if( i + 1 < get_global_size(0) )\n"
        "   {\n"
        "       double3 f0 = vload3( 0, in + 3 * i );\n"
        "       f0 = ",
        name,
        "( f0 );\n"
        "       vstore3( f0, 0, out + 3*i );\n"
        "   }\n"
        "   else\n"
        "   {\n"
58 " size_t parity = i & 1; // Figure out how many elements are "
59 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
60 "buffer size \n"
61 " double3 f0;\n"
62 " switch( parity )\n"
63 " {\n"
64 " case 1:\n"
65 " f0 = (double3)( in[3*i], NAN, NAN ); \n"
66 " break;\n"
67 " case 0:\n"
68 " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
69 " break;\n"
70 " }\n"
71 " f0 = ",
72 name,
73 "( f0 );\n"
74 " switch( parity )\n"
75 " {\n"
76 " case 0:\n"
77 " out[3*i+1] = f0.y; \n"
78 " // fall through\n"
79 " case 1:\n"
80 " out[3*i] = f0.x; \n"
81 " break;\n"
82 " }\n"
83 " }\n"
84 "}\n"
85 };
86
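    // Pick the generic kernel source, or the special-cased source for
    // 3-element vectors.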
    const char **kern = c;
    size_t kernSize = sizeof(c) / sizeof(c[0]);

    if (sizeValues[vectorSize] == 3)
    {
        kern = c3;
        kernSize = sizeof(c3) / sizeof(c3[0]);
    }

    char testName[32];
    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
             sizeNames[vectorSize]);

    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                       relaxedMode);
}

typedef struct BuildKernelInfo
{
    cl_uint offset; // the first vector size to build
    cl_uint kernel_count;
    cl_kernel **kernels;
    cl_program *programs;
    const char *nameInCode;
    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
} BuildKernelInfo;

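// ThreadPool callback: builds the kernels for one vector size (offset +
// job_id).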
static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
{
    BuildKernelInfo *info = (BuildKernelInfo *)p;
    cl_uint i = info->offset + job_id;
    return BuildKernel(info->nameInCode, i, info->kernel_count,
                       info->kernels[i], info->programs + i, info->relaxedMode);
}

// Thread specific data for a worker thread
typedef struct ThreadInfo
{
    cl_mem inBuf; // input buffer for the thread
    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
    float maxError; // max ULP error seen so far. Init to 0.
    double maxErrorValue; // input value at which the max error occurred. Init
                          // to 0.
    cl_command_queue tQueue; // per thread command queue to improve performance
} ThreadInfo;

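// Parameters and shared state for a single TestFunc_Double_Double run.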
typedef struct TestInfo
{
    size_t subBufferSize; // Size of the sub-buffer in elements
    const Func *f; // A pointer to the function info
    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
    cl_kernel
        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
                               // worker thread: k[vector_size][thread_id]
    ThreadInfo *
        tinfo; // An array of thread specific information for each worker thread
    cl_uint threadCount; // Number of worker threads
    cl_uint jobCount; // Number of jobs
    cl_uint step; // step between each chunk and the next.
    cl_uint scale; // stride between individual test values
    float ulps; // max allowed ulps
    int ftz; // non-zero if running in flush to zero mode

    int isRangeLimited; // 1 if the function is only to be evaluated over a
                        // range
    float half_sin_cos_tan_limit;
    bool relaxedMode; // True if test is running in relaxed mode, false
                      // otherwise.
} TestInfo;

static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);

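// Tests a double -> double function: builds per-thread kernels for every
// vector size, then uses the thread pool to run Test() over chunks of the
// 32-bit pattern space of input values, accumulating the worst ULP error seen.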
int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
{
    TestInfo test_info;
    cl_int error;
    float maxError = 0.0f;
    double maxErrorVal = 0.0;

    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
    // Init test_info
    memset(&test_info, 0, sizeof(test_info));
    test_info.threadCount = GetThreadCount();
    test_info.subBufferSize = BUFFER_SIZE
        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
    test_info.scale = getTestScale(sizeof(cl_double));

    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
    if (test_info.step / test_info.subBufferSize != test_info.scale)
    {
        // there was overflow
        test_info.jobCount = 1;
    }
    else
    {
        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
    }

    test_info.f = f;
    test_info.ulps = f->double_ulps;
    test_info.ftz = f->ftz || gForceFTZ;
    test_info.relaxedMode = relaxedMode;

    // cl_kernels aren't thread safe, so we make one for each vector size for
    // every thread
    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
    {
        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
        test_info.k[i] = (cl_kernel *)malloc(array_size);
        if (NULL == test_info.k[i])
        {
            vlog_error("Error: Unable to allocate storage for kernels!\n");
            error = CL_OUT_OF_HOST_MEMORY;
            goto exit;
        }
        memset(test_info.k[i], 0, array_size);
    }
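    // Allocate per-thread state: sub-buffers, a command queue, and max-error
    // tracking for each worker thread.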
    test_info.tinfo =
        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
    if (NULL == test_info.tinfo)
    {
        vlog_error(
            "Error: Unable to allocate storage for thread specific data.\n");
        error = CL_OUT_OF_HOST_MEMORY;
        goto exit;
    }
    memset(test_info.tinfo, 0,
           test_info.threadCount * sizeof(*test_info.tinfo));
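    // Carve a per-thread sub-buffer out of each global buffer and give each
    // worker thread its own command queue.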
    for (cl_uint i = 0; i < test_info.threadCount; i++)
    {
        cl_buffer_region region = {
            i * test_info.subBufferSize * sizeof(cl_double),
            test_info.subBufferSize * sizeof(cl_double)
        };
        test_info.tinfo[i].inBuf =
            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
        if (error || NULL == test_info.tinfo[i].inBuf)
        {
            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
                       "region {%zd, %zd}\n",
                       region.origin, region.size);
            goto exit;
        }

        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
        {
            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                &region, &error);
            if (error || NULL == test_info.tinfo[i].outBuf[j])
            {
                vlog_error("Error: Unable to create sub-buffer of "
                           "gOutBuffer[%d] for region {%zd, %zd}\n",
                           (int)j, region.origin, region.size);
                goto exit;
            }
        }
        test_info.tinfo[i].tQueue =
            clCreateCommandQueue(gContext, gDevice, 0, &error);
        if (NULL == test_info.tinfo[i].tQueue || error)
        {
            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
            goto exit;
        }
    }

    // Init the kernels
    {
        BuildKernelInfo build_info = {
            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
            test_info.programs, f->nameInCode, relaxedMode
        };
        if ((error = ThreadPool_Do(BuildKernelFn,
                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                   &build_info)))
            goto exit;
    }

    // Run the kernels
    if (!gSkipCorrectnessTesting)
    {
        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);

        // Accumulate the arithmetic errors
        for (cl_uint i = 0; i < test_info.threadCount; i++)
        {
            if (test_info.tinfo[i].maxError > maxError)
            {
                maxError = test_info.tinfo[i].maxError;
                maxErrorVal = test_info.tinfo[i].maxErrorValue;
            }
        }

        if (error) goto exit;

        if (gWimpyMode)
            vlog("Wimp pass");
        else
            vlog("passed");

        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
    }

    vlog("\n");

exit:
    // Release
    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
    {
        clReleaseProgram(test_info.programs[i]);
        if (test_info.k[i])
        {
            for (cl_uint j = 0; j < test_info.threadCount; j++)
                clReleaseKernel(test_info.k[i][j]);

            free(test_info.k[i]);
        }
    }
    if (test_info.tinfo)
    {
        for (cl_uint i = 0; i < test_info.threadCount; i++)
        {
            clReleaseMemObject(test_info.tinfo[i].inBuf);
            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
        }

        free(test_info.tinfo);
    }

    return error;
}

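// Runs one chunk of the test: generates the chunk's input values, executes the
// kernel for every vector size on this thread's sub-buffers, then checks each
// result against the correctly rounded host reference.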
static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
{
    const TestInfo *job = (const TestInfo *)data;
    size_t buffer_elements = job->subBufferSize;
    size_t buffer_size = buffer_elements * sizeof(cl_double);
    cl_uint scale = job->scale;
    cl_uint base = job_id * (cl_uint)job->step;
    ThreadInfo *tinfo = job->tinfo + thread_id;
    float ulps = job->ulps;
    dptr func = job->f->dfunc;
    cl_int error;
    int ftz = job->ftz;

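    // Make sure the host evaluates the reference function at (at least) double
    // precision; on some platforms the FPU precision must be forced
    // explicitly.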
    Force64BitFPUPrecision();

    // start the map of the output arrays
    cl_event e[VECTOR_SIZE_COUNT];
    cl_ulong *out[VECTOR_SIZE_COUNT];
    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        out[j] = (cl_ulong *)clEnqueueMapBuffer(
            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
            buffer_size, 0, NULL, e + j, &error);
        if (error || NULL == out[j])
        {
            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                       error);
            return error;
        }
    }

    // Get that moving
    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");

    // Write the new values to the input array
    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
    for (size_t j = 0; j < buffer_elements; j++)
        p[j] = DoubleFromUInt32(base + j * scale);

    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                      buffer_size, p, 0, NULL, NULL)))
    {
        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
        return error;
    }

    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        // Wait for the map to finish
        if ((error = clWaitForEvents(1, e + j)))
        {
            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
            return error;
        }
        if ((error = clReleaseEvent(e[j])))
        {
            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
            return error;
        }

        // Fill the result buffer with garbage, so that old results don't carry
        // over
        uint32_t pattern = 0xffffdead;
        memset_pattern4(out[j], &pattern, buffer_size);
        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                             out[j], 0, NULL, NULL)))
        {
            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
                       error);
            return error;
        }

        // run the kernel
        size_t vectorCount =
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
        cl_program program = job->programs[j];

        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
                                    &tinfo->outBuf[j])))
        {
            LogBuildError(program);
            return error;
        }
        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
                                    &tinfo->inBuf)))
        {
            LogBuildError(program);
            return error;
        }

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
        {
            vlog_error("FAILED -- could not execute kernel\n");
            return error;
        }
    }


    // Get that moving
    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");

    if (gSkipCorrectnessTesting) return CL_SUCCESS;

    // Calculate the correctly rounded reference result
    cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
    cl_double *s = (cl_double *)p;
    for (size_t j = 0; j < buffer_elements; j++)
        r[j] = (cl_double)func.f_f(s[j]);

    // Read the data back -- no need to wait for the first N-1 buffers but wait
    // for the last buffer. This is an in order queue.
    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
        out[j] = (cl_ulong *)clEnqueueMapBuffer(
            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
            buffer_size, 0, NULL, NULL, &error);
        if (error || NULL == out[j])
        {
            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                       error);
            return error;
        }
    }

    // Verify data
    cl_ulong *t = (cl_ulong *)r;
    for (size_t j = 0; j < buffer_elements; j++)
    {
        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
        {
            cl_ulong *q = out[k];

            // If we aren't getting the correctly rounded result
            if (t[j] != q[j])
            {
                cl_double test = ((cl_double *)q)[j];
                long double correct = func.f_f(s[j]);
                float err = Bruteforce_Ulp_Error_Double(test, correct);
                int fail = !(fabsf(err) <= ulps);

                if (fail)
                {
                    if (ftz)
                    {
                        // retry per section 6.5.3.2
                        if (IsDoubleResultSubnormal(correct, ulps))
                        {
                            fail = fail && (test != 0.0f);
                            if (!fail) err = 0.0f;
                        }

                        // retry per section 6.5.3.3
                        if (IsDoubleSubnormal(s[j]))
                        {
                            long double correct2 = func.f_f(0.0L);
                            long double correct3 = func.f_f(-0.0L);
                            float err2 =
                                Bruteforce_Ulp_Error_Double(test, correct2);
                            float err3 =
                                Bruteforce_Ulp_Error_Double(test, correct3);
                            fail = fail
                                && ((!(fabsf(err2) <= ulps))
                                    && (!(fabsf(err3) <= ulps)));
                            if (fabsf(err2) < fabsf(err)) err = err2;
                            if (fabsf(err3) < fabsf(err)) err = err3;

                            // retry per section 6.5.3.4
                            if (IsDoubleResultSubnormal(correct2, ulps)
                                || IsDoubleResultSubnormal(correct3, ulps))
                            {
                                fail = fail && (test != 0.0f);
                                if (!fail) err = 0.0f;
                            }
                        }
                    }
                }
                if (fabsf(err) > tinfo->maxError)
                {
                    tinfo->maxError = fabsf(err);
                    tinfo->maxErrorValue = s[j];
                }
                if (fail)
                {
507 vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
508 "(0x%16.16llx): *%.13la vs. %.13la\n",
509 job->f->name, sizeNames[k], err,
510 ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j],
511 ((cl_double *)gOut_Ref)[j], test);
                    return -1;
                }
            }
        }
    }

    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                             out[j], 0, NULL, NULL)))
        {
            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
                       j, error);
            return error;
        }
    }

    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");

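    // Print a progress marker (or verbose per-chunk stats) periodically so
    // long runs show signs of life.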
    if (0 == (base & 0x0fffffff))
    {
        if (gVerboseBruteForce)
        {
536 vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f "
537 "ThreadCount:%2u\n",
538 base, job->step, buffer_elements, job->scale, job->ulps,
539 job->threadCount);
        }
        else
        {
            vlog(".");
        }
        fflush(stdout);
    }

    return CL_SUCCESS;
}