1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include "testBase.h"
19 #include "harness/testHarness.h"
20 #include "harness/typeWrappers.h"
21 #include "harness/conversions.h"
22 #include "harness/errorHelpers.h"
23 #include <float.h>
24 
25 const char *crossKernelSource =
26 "__kernel void sample_test(__global float4 *sourceA, __global float4 *sourceB, __global float4 *destValues)\n"
27 "{\n"
28 "    int  tid = get_global_id(0);\n"
29 "    destValues[tid] = cross( sourceA[tid], sourceB[tid] );\n"
30 "\n"
31 "}\n" ;
32 
33 const char *crossKernelSourceV3 =
34 "__kernel void sample_test(__global float *sourceA, __global float *sourceB, __global float *destValues)\n"
35 "{\n"
36 "    int  tid = get_global_id(0);\n"
37 "    vstore3( cross( vload3( tid, sourceA), vload3( tid,  sourceB) ), tid, destValues );\n"
38 "\n"
39 "}\n";
40 
41 const char *twoToFloatKernelPattern =
42 "__kernel void sample_test(__global float%s *sourceA, __global float%s *sourceB, __global float *destValues)\n"
43 "{\n"
44 "    int  tid = get_global_id(0);\n"
45 "    destValues[tid] = %s( sourceA[tid], sourceB[tid] );\n"
46 "\n"
47 "}\n";
48 
49 const char *twoToFloatKernelPatternV3 =
50 "__kernel void sample_test(__global float%s *sourceA, __global float%s *sourceB, __global float *destValues)\n"
51 "{\n"
52 "    int  tid = get_global_id(0);\n"
53 "    destValues[tid] = %s( vload3( tid, (__global float*) sourceA), vload3( tid, (__global float*) sourceB) );\n"
54 "\n"
55 "}\n";
56 
57 const char *oneToFloatKernelPattern =
58 "__kernel void sample_test(__global float%s *sourceA, __global float *destValues)\n"
59 "{\n"
60 "    int  tid = get_global_id(0);\n"
61 "    destValues[tid] = %s( sourceA[tid] );\n"
62 "\n"
63 "}\n";
64 
65 const char *oneToFloatKernelPatternV3 =
66 "__kernel void sample_test(__global float%s *sourceA, __global float *destValues)\n"
67 "{\n"
68 "    int  tid = get_global_id(0);\n"
69 "    destValues[tid] = %s( vload3( tid, (__global float*) sourceA) );\n"
70 "\n"
71 "}\n";
72 
73 const char *oneToOneKernelPattern =
74 "__kernel void sample_test(__global float%s *sourceA, __global float%s *destValues)\n"
75 "{\n"
76 "    int  tid = get_global_id(0);\n"
77 "    destValues[tid] = %s( sourceA[tid] );\n"
78 "\n"
79 "}\n";
80 
81 const char *oneToOneKernelPatternV3 =
82 "__kernel void sample_test(__global float%s *sourceA, __global float%s *destValues)\n"
83 "{\n"
84 "    int  tid = get_global_id(0);\n"
85 "    vstore3( %s( vload3( tid, (__global float*) sourceA) ), tid, (__global float*) destValues );\n"
86 "\n"
87 "}\n";
88 
89 #define TEST_SIZE (1 << 20)
90 
91 double verifyFastDistance( float *srcA, float *srcB, size_t vecSize );
92 double verifyFastLength( float *srcA, size_t vecSize );
93 
94 
95 
vector2string(char * string,float * vector,size_t elements)96 void vector2string( char *string, float *vector, size_t elements )
97 {
98     *string++ = '{';
99     *string++ = ' ';
100     string += sprintf( string, "%a", vector[0] );
101     size_t i;
102     for( i = 1; i < elements; i++ )
103         string += sprintf( string, ", %a", vector[i] );
104     *string++ = ' ';
105     *string++ = '}';
106     *string = '\0';
107 }
108 
fillWithTrickyNumbers(float * aVectors,float * bVectors,size_t vecSize)109 void fillWithTrickyNumbers( float *aVectors, float *bVectors, size_t vecSize )
110 {
111     static const cl_float trickyValues[] = { -FLT_EPSILON, FLT_EPSILON,
112         MAKE_HEX_FLOAT(0x1.0p63f, 0x1L, 63), MAKE_HEX_FLOAT(0x1.8p63f, 0x18L, 59), MAKE_HEX_FLOAT(0x1.0p64f, 0x1L, 64), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.8p-63f, -0x18L, -67), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
113         MAKE_HEX_FLOAT(0x1.0p-63f, 0x1L, -63), MAKE_HEX_FLOAT(0x1.8p-63f, 0x18L, -67), MAKE_HEX_FLOAT(0x1.0p-64f, 0x1L, -64), MAKE_HEX_FLOAT(-0x1.0p-63f, -0x1L, -63), MAKE_HEX_FLOAT(-0x1.8p-63f, -0x18L, -67), MAKE_HEX_FLOAT(-0x1.0p-64f, -0x1L, -64),
114         FLT_MAX / 2.f, -FLT_MAX / 2.f, INFINITY,  -INFINITY, 0.f, -0.f };
115     static const size_t trickyCount = sizeof( trickyValues ) / sizeof( trickyValues[0] );
116     static const size_t stride[4] = {1, trickyCount, trickyCount*trickyCount, trickyCount*trickyCount*trickyCount };
117     size_t i, j, k;
118 
119     for( j = 0; j < vecSize; j++ )
120         for( k = 0; k < vecSize; k++ )
121             for( i = 0; i < trickyCount; i++ )
122                 aVectors[ j + stride[j] * (i + k*trickyCount)*vecSize] = trickyValues[i];
123 
124     if( bVectors )
125     {
126         size_t copySize = vecSize * vecSize * trickyCount;
127         memset( bVectors, 0, sizeof(float) * copySize );
128         memset( aVectors + copySize, 0, sizeof(float) * copySize );
129         memcpy( bVectors + copySize, aVectors, sizeof(float) * copySize );
130     }
131 }
132 
133 
cross_product(const float * vecA,const float * vecB,float * outVector,float * errorTolerances,float ulpTolerance)134 void cross_product( const float *vecA, const float *vecB, float *outVector, float *errorTolerances, float ulpTolerance )
135 {
136     outVector[ 0 ] = ( vecA[ 1 ] * vecB[ 2 ] ) - ( vecA[ 2 ] * vecB[ 1 ] );
137     outVector[ 1 ] = ( vecA[ 2 ] * vecB[ 0 ] ) - ( vecA[ 0 ] * vecB[ 2 ] );
138     outVector[ 2 ] = ( vecA[ 0 ] * vecB[ 1 ] ) - ( vecA[ 1 ] * vecB[ 0 ] );
139     outVector[ 3 ] = 0.0f;
140 
141     errorTolerances[ 0 ] = fmaxf( fabsf( vecA[ 1 ] ), fmaxf( fabsf( vecB[ 2 ] ), fmaxf( fabsf( vecA[ 2 ] ), fabsf( vecB[ 1 ] ) ) ) );
142     errorTolerances[ 1 ] = fmaxf( fabsf( vecA[ 2 ] ), fmaxf( fabsf( vecB[ 0 ] ), fmaxf( fabsf( vecA[ 0 ] ), fabsf( vecB[ 2 ] ) ) ) );
143     errorTolerances[ 2 ] = fmaxf( fabsf( vecA[ 0 ] ), fmaxf( fabsf( vecB[ 1 ] ), fmaxf( fabsf( vecA[ 1 ] ), fabsf( vecB[ 0 ] ) ) ) );
144 
145     errorTolerances[ 0 ] = errorTolerances[ 0 ] * errorTolerances[ 0 ] * ( ulpTolerance * FLT_EPSILON );    // This gives us max squared times ulp tolerance, i.e. the worst-case expected variance we could expect from this result
146     errorTolerances[ 1 ] = errorTolerances[ 1 ] * errorTolerances[ 1 ] * ( ulpTolerance * FLT_EPSILON );
147     errorTolerances[ 2 ] = errorTolerances[ 2 ] * errorTolerances[ 2 ] * ( ulpTolerance * FLT_EPSILON );
148 }
149 
150 
151 
152 
test_geom_cross(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)153 int test_geom_cross(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
154 {
155     int vecsize;
156     RandomSeed seed(gRandomSeed);
157 
158     /* Get the default rounding mode */
159     cl_device_fp_config defaultRoundingMode = get_default_rounding_mode(deviceID);
160     if( 0 == defaultRoundingMode )
161         return -1;
162 
163 
164     for(vecsize = 3; vecsize <= 4; ++vecsize)
165     {
166         clProgramWrapper program;
167         clKernelWrapper kernel;
168         clMemWrapper streams[3];
169         BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE * vecsize));
170         BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE * vecsize));
171         BufferOwningPtr<cl_float> C(malloc(sizeof(cl_float) * TEST_SIZE * vecsize));
172         cl_float testVector[4];
173         int error, i;
174         cl_float *inDataA = A;
175         cl_float *inDataB = B;
176         cl_float *outData = C;
177         size_t threads[1], localThreads[1];
178 
179         /* Create kernels */
180         if( create_single_kernel_helper( context, &program, &kernel, 1, vecsize == 3 ? &crossKernelSourceV3 : &crossKernelSource, "sample_test" ) )
181             return -1;
182 
183         /* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */
184         for( i = 0; i < TEST_SIZE * vecsize; i++ )
185         {
186             inDataA[ i ] = get_random_float( -512.f, 512.f, seed );
187             inDataB[ i ] = get_random_float( -512.f, 512.f, seed );
188         }
189         fillWithTrickyNumbers( inDataA, inDataB, vecsize );
190 
191         streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
192                                     sizeof(cl_float) * vecsize * TEST_SIZE,
193                                     inDataA, NULL);
194         if( streams[0] == NULL )
195         {
196             log_error("ERROR: Creating input array A failed!\n");
197             return -1;
198         }
199         streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
200                                     sizeof(cl_float) * vecsize * TEST_SIZE,
201                                     inDataB, NULL);
202         if( streams[1] == NULL )
203         {
204             log_error("ERROR: Creating input array B failed!\n");
205             return -1;
206         }
207         streams[2] =
208             clCreateBuffer(context, CL_MEM_READ_WRITE,
209                            sizeof(cl_float) * vecsize * TEST_SIZE, NULL, NULL);
210         if( streams[2] == NULL )
211         {
212             log_error("ERROR: Creating output array failed!\n");
213             return -1;
214         }
215 
216         /* Assign streams and execute */
217         for( i = 0; i < 3; i++ )
218         {
219             error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
220             test_error( error, "Unable to set indexed kernel arguments" );
221         }
222 
223         /* Run the kernel */
224         threads[0] = TEST_SIZE;
225 
226         error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
227         test_error( error, "Unable to get work group size to use" );
228 
229         error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
230         test_error( error, "Unable to execute test kernel" );
231 
232         /* Now get the results */
233         error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_float ) * TEST_SIZE * vecsize, outData, 0, NULL, NULL );
234         test_error( error, "Unable to read output array!" );
235 
236         /* And verify! */
237         for( i = 0; i < TEST_SIZE; i++ )
238         {
239             float errorTolerances[ 4 ];
240             // On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product
241             cross_product( inDataA + i * vecsize, inDataB + i * vecsize, testVector, errorTolerances, 3.f );
242 
243         // RTZ devices accrue approximately double the amount of error per operation.  Allow for that.
244         if( defaultRoundingMode == CL_FP_ROUND_TO_ZERO )
245         {
246             errorTolerances[0] *= 2.0f;
247             errorTolerances[1] *= 2.0f;
248             errorTolerances[2] *= 2.0f;
249             errorTolerances[3] *= 2.0f;
250         }
251 
252             float errs[] = { fabsf( testVector[ 0 ] - outData[ i * vecsize + 0 ] ),
253                              fabsf( testVector[ 1 ] - outData[ i * vecsize + 1 ] ),
254                              fabsf( testVector[ 2 ] - outData[ i * vecsize + 2 ] ) };
255 
256             if( errs[ 0 ] > errorTolerances[ 0 ] || errs[ 1 ] > errorTolerances[ 1 ] || errs[ 2 ] > errorTolerances[ 2 ] )
257             {
258                 log_error( "ERROR: Data sample %d does not validate! Expected (%a,%a,%a,%a), got (%a,%a,%a,%a)\n",
259                           i, testVector[0], testVector[1], testVector[2], testVector[3],
260                           outData[i*vecsize], outData[i*vecsize+1], outData[i*vecsize+2], outData[i*vecsize+3] );
261                 log_error( "    Input: (%a %a %a) and (%a %a %a)\n",
262                           inDataA[ i * vecsize + 0 ], inDataA[ i * vecsize + 1 ], inDataA[ i * vecsize + 2 ],
263                           inDataB[ i * vecsize + 0 ], inDataB[ i * vecsize + 1 ], inDataB[ i * vecsize + 2 ] );
264                 log_error( "    Errors: (%a out of %a), (%a out of %a), (%a out of %a)\n",
265                           errs[ 0 ], errorTolerances[ 0 ], errs[ 1 ], errorTolerances[ 1 ], errs[ 2 ], errorTolerances[ 2 ] );
266                 log_error("     ulp %f\n", Ulp_Error( outData[ i * vecsize + 1 ], testVector[ 1 ] ) );
267                 return -1;
268             }
269         }
270     } // for(vecsize=...
271 
272     if(!is_extension_available(deviceID, "cl_khr_fp64")) {
273         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
274         return 0;
275     } else {
276         log_info("Testing doubles...\n");
277         return test_geom_cross_double( deviceID,  context,  queue,  num_elements, seed);
278     }
279 }
280 
getMaxValue(float vecA[],float vecB[],size_t vecSize)281 float getMaxValue( float vecA[], float vecB[], size_t vecSize )
282 {
283     float a = fmaxf( fabsf( vecA[ 0 ] ), fabsf( vecB[ 0 ] ) );
284     for( size_t i = 1; i < vecSize; i++ )
285         a = fmaxf( fabsf( vecA[ i ] ), fmaxf( fabsf( vecB[ i ] ), a ) );
286     return a;
287 }
288 
289 typedef double (*twoToFloatVerifyFn)( float *srcA, float *srcB, size_t vecSize );
290 
test_twoToFloat_kernel(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,twoToFloatVerifyFn verifyFn,float ulpLimit,MTdata d)291 int test_twoToFloat_kernel(cl_command_queue queue, cl_context context, const char *fnName,
292                            size_t vecSize, twoToFloatVerifyFn verifyFn, float ulpLimit, MTdata d )
293 {
294     clProgramWrapper program;
295     clKernelWrapper kernel;
296     clMemWrapper streams[3];
297     int error;
298     size_t i, threads[1], localThreads[1];
299     char kernelSource[10240];
300     char *programPtr;
301     char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
302     int hasInfNan = 1;
303     cl_device_id device = NULL;
304 
305     error = clGetCommandQueueInfo( queue, CL_QUEUE_DEVICE, sizeof( device ), &device, NULL );
306     test_error( error, "Unable to get command queue device" );
307 
308     /* Check for embedded devices doing nutty stuff */
309     error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof( kernelSource ), kernelSource, NULL );
310     test_error( error, "Unable to get device profile" );
311     if( 0 == strcmp( kernelSource, "EMBEDDED_PROFILE" ) )
312     {
313         cl_device_fp_config config = 0;
314         error = clGetDeviceInfo( device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof( config ), &config, NULL );
315         test_error( error, "Unable to get CL_DEVICE_SINGLE_FP_CONFIG" );
316 
317         if( CL_FP_ROUND_TO_ZERO == (config & (CL_FP_ROUND_TO_NEAREST|CL_FP_ROUND_TO_ZERO)))
318             ulpLimit *= 2.0f; // rtz operations average twice the accrued error of rte operations
319 
320         if( 0 == (config & CL_FP_INF_NAN) )
321             hasInfNan = 0;
322     }
323 
324     BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE * 4));
325     BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE * 4));
326     BufferOwningPtr<cl_float> C(malloc(sizeof(cl_float) * TEST_SIZE));
327 
328     cl_float *inDataA = A;
329     cl_float *inDataB = B;
330     cl_float *outData = C;
331 
332     /* Create the source */
333     sprintf( kernelSource, vecSize == 3 ? twoToFloatKernelPatternV3 : twoToFloatKernelPattern, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
334 
335     /* Create kernels */
336     programPtr = kernelSource;
337     if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
338     {
339         return -1;
340     }
341     /* Generate some streams */
342     for( i = 0; i < TEST_SIZE * vecSize; i++ )
343     {
344         inDataA[ i ] = get_random_float( -512.f, 512.f, d );
345         inDataB[ i ] = get_random_float( -512.f, 512.f, d );
346     }
347     fillWithTrickyNumbers( inDataA, inDataB, vecSize );
348 
349     /* Clamp values to be in range for fast_ functions */
350     if( verifyFn == verifyFastDistance )
351     {
352         for( i = 0; i < TEST_SIZE * vecSize; i++ )
353         {
354             if( fabsf( inDataA[i] ) > MAKE_HEX_FLOAT(0x1.0p62f, 0x1L, 62) || fabsf( inDataA[i] ) < MAKE_HEX_FLOAT(0x1.0p-62f, 0x1L, -62) )
355                 inDataA[ i ] = get_random_float( -512.f, 512.f, d );
356             if( fabsf( inDataB[i] ) > MAKE_HEX_FLOAT(0x1.0p62f, 0x1L, 62) || fabsf( inDataB[i] ) < MAKE_HEX_FLOAT(0x1.0p-62f, 0x1L, -62) )
357                 inDataB[ i ] = get_random_float( -512.f, 512.f, d );
358         }
359     }
360 
361 
362     streams[0] =
363         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
364                        sizeof(cl_float) * vecSize * TEST_SIZE, inDataA, NULL);
365     if( streams[0] == NULL )
366     {
367         log_error("ERROR: Creating input array A failed!\n");
368         return -1;
369     }
370     streams[1] =
371         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
372                        sizeof(cl_float) * vecSize * TEST_SIZE, inDataB, NULL);
373     if( streams[1] == NULL )
374     {
375         log_error("ERROR: Creating input array B failed!\n");
376         return -1;
377     }
378     streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
379                                 sizeof(cl_float) * TEST_SIZE, NULL, NULL);
380     if( streams[2] == NULL )
381     {
382         log_error("ERROR: Creating output array failed!\n");
383         return -1;
384     }
385 
386     /* Assign streams and execute */
387     for( i = 0; i < 3; i++ )
388     {
389         error = clSetKernelArg(kernel, (int)i, sizeof( streams[i] ), &streams[i]);
390         test_error( error, "Unable to set indexed kernel arguments" );
391     }
392 
393     /* Run the kernel */
394     threads[0] = TEST_SIZE;
395 
396     error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
397     test_error( error, "Unable to get work group size to use" );
398 
399     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
400     test_error( error, "Unable to execute test kernel" );
401 
402     /* Now get the results */
403     error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_float ) * TEST_SIZE, outData, 0, NULL, NULL );
404     test_error( error, "Unable to read output array!" );
405 
406 
407     /* And verify! */
408     int skipCount = 0;
409     for( i = 0; i < TEST_SIZE; i++ )
410     {
411         cl_float *src1 = inDataA + i * vecSize;
412         cl_float *src2 = inDataB + i * vecSize;
413         double expected = verifyFn( src1, src2, vecSize );
414         if( (float) expected != outData[ i ] )
415         {
416             if( isnan(expected) && isnan( outData[i] ) )
417                 continue;
418 
419             if( ! hasInfNan )
420             {
421                 size_t ii;
422                 for( ii = 0; ii < vecSize; ii++ )
423                 {
424                     if( ! isfinite( src1[ii] ) || ! isfinite( src2[ii] ) )
425                     {
426                         skipCount++;
427                         continue;
428                     }
429                 }
430                 if( ! isfinite( (cl_float) expected ) )
431                 {
432                     skipCount++;
433                     continue;
434                 }
435             }
436 
437             if( ulpLimit < 0 )
438             {
439                 // Limit below zero means we need to test via a computed error (like cross product does)
440                 float maxValue =
441                 getMaxValue( inDataA + i * vecSize, inDataB + i * vecSize,vecSize );
442                 // In this case (dot is the only one that gets here), the ulp is 2*vecSize - 1 (n + n-1 max # of errors)
443                 float errorTolerance = maxValue * maxValue * ( 2.f * (float)vecSize - 1.f ) * FLT_EPSILON;
444 
445                 // Limit below zero means test via epsilon instead
446                 double error =
447                 fabs( (double)expected - (double)outData[ i ] );
448                 if( error > errorTolerance )
449                 {
450 
451                     log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) error of %g against tolerance %g\n",
452                               (int)i, (int)vecSize, expected,
453                               outData[ i ],
454                               inDataA[i*vecSize],
455                               inDataB[i*vecSize],
456                               (float)error,
457                               (float)errorTolerance );
458 
459                     char vecA[1000], vecB[1000];
460                     vector2string( vecA, inDataA +i * vecSize, vecSize );
461                     vector2string( vecB, inDataB + i * vecSize, vecSize );
462                     log_error( "\tvector A: %s, vector B: %s\n", vecA, vecB );
463                     return -1;
464                 }
465             }
466             else
467             {
468                 float error = Ulp_Error( outData[ i ], expected );
469                 if( fabsf(error) > ulpLimit )
470                 {
471                     log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) ulp of %f\n",
472                               (int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], inDataB[i*vecSize], error );
473 
474                     char vecA[1000], vecB[1000];
475                     vector2string( vecA, inDataA + i * vecSize, vecSize );
476                     vector2string( vecB, inDataB + i * vecSize, vecSize );
477                     log_error( "\tvector A: %s, vector B: %s\n", vecA, vecB );
478                     return -1;
479                 }
480             }
481         }
482     }
483 
484     if( skipCount )
485         log_info( "Skipped %d tests out of %d because they contained Infs or NaNs\n\tEMBEDDED_PROFILE Device does not support CL_FP_INF_NAN\n", skipCount, TEST_SIZE );
486 
487     return 0;
488 }
489 
verifyDot(float * srcA,float * srcB,size_t vecSize)490 double verifyDot( float *srcA, float *srcB, size_t vecSize )
491 {
492     double total = 0.f;
493 
494     for( unsigned int i = 0; i < vecSize; i++ )
495         total += (double)srcA[ i ] * (double)srcB[ i ];
496 
497     return total;
498 }
499 
test_geom_dot(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)500 int test_geom_dot(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
501 {
502     size_t sizes[] = { 1, 2, 3, 4, 0 };
503     unsigned int size;
504     int retVal = 0;
505     RandomSeed seed(gRandomSeed);
506 
507     for( size = 0; sizes[ size ] != 0 ; size++ )
508     {
509         if( test_twoToFloat_kernel( queue, context, "dot", sizes[size], verifyDot, -1.0f /*magic value*/, seed ) != 0 )
510         {
511             log_error( "   dot vector size %d FAILED\n", (int)sizes[ size ] );
512             retVal = -1;
513         }
514     }
515 
516     if (retVal)
517         return retVal;
518 
519     if(!is_extension_available(deviceID, "cl_khr_fp64"))
520     {
521         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
522         return 0;
523     }
524 
525     log_info("Testing doubles...\n");
526     return test_geom_dot_double( deviceID,  context,  queue,  num_elements, seed);
527 }
528 
verifyFastDistance(float * srcA,float * srcB,size_t vecSize)529 double verifyFastDistance( float *srcA, float *srcB, size_t vecSize )
530 {
531     double total = 0, value;
532     unsigned int i;
533 
534     // We calculate the distance as a double, to try and make up for the fact that
535     // the GPU has better precision distance since it's a single op
536     for( i = 0; i < vecSize; i++ )
537     {
538         value = (double)srcA[i] - (double)srcB[i];
539         total += value * value;
540     }
541 
542     return sqrt( total );
543 }
544 
test_geom_fast_distance(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)545 int test_geom_fast_distance(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
546 {
547     size_t sizes[] = { 1, 2, 3, 4, 0 };
548     unsigned int size;
549     int retVal = 0;
550     RandomSeed seed(gRandomSeed);
551 
552     for( size = 0; sizes[ size ] != 0 ; size++ )
553     {
554         float maxUlps = 8192.0f +                           // error in sqrt
555         ( 1.5f * (float) sizes[size] +      // cumulative error for multiplications  (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
556          0.5f * (float) (sizes[size]-1));    // cumulative error for additions
557 
558         if( test_twoToFloat_kernel( queue, context, "fast_distance",
559                                    sizes[ size ], verifyFastDistance,
560                                    maxUlps, seed ) != 0 )
561         {
562             log_error( "   fast_distance vector size %d FAILED\n",
563                       (int)sizes[ size ] );
564             retVal = -1;
565         }
566         else
567         {
568             log_info( "   fast_distance vector size %d passed\n",
569                      (int)sizes[ size ] );
570         }
571     }
572     return retVal;
573 }
574 
575 
verifyDistance(float * srcA,float * srcB,size_t vecSize)576 double verifyDistance( float *srcA, float *srcB, size_t vecSize )
577 {
578     double total = 0, value;
579     unsigned int i;
580 
581     // We calculate the distance as a double, to try and make up for the fact that
582     // the GPU has better precision distance since it's a single op
583     for( i = 0; i < vecSize; i++ )
584     {
585         value = (double)srcA[i] - (double)srcB[i];
586         total += value * value;
587     }
588 
589     return sqrt( total );
590 }
591 
test_geom_distance(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)592 int test_geom_distance(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
593 {
594     size_t sizes[] = { 1, 2, 3, 4, 0 };
595     unsigned int size;
596     int retVal = 0;
597     RandomSeed seed(gRandomSeed );
598 
599     for( size = 0; sizes[ size ] != 0 ; size++ )
600     {
601         float maxUlps = 3.0f +                              // error in sqrt
602         ( 1.5f * (float) sizes[size] +      // cumulative error for multiplications  (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
603          0.5f * (float) (sizes[size]-1));    // cumulative error for additions
604 
605         if( test_twoToFloat_kernel( queue, context, "distance", sizes[ size ], verifyDistance, maxUlps, seed ) != 0 )
606         {
607             log_error( "   distance vector size %d FAILED\n",
608                       (int)sizes[ size ] );
609             retVal = -1;
610         }
611         else
612         {
613             log_info( "   distance vector size %d passed\n", (int)sizes[ size ] );
614         }
615     }
616     if (retVal)
617         return retVal;
618 
619     if(!is_extension_available(deviceID, "cl_khr_fp64"))
620     {
621         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
622         return 0;
623     } else {
624         log_info("Testing doubles...\n");
625         return test_geom_distance_double( deviceID,  context,  queue,  num_elements, seed);
626     }
627 }
628 
629 typedef double (*oneToFloatVerifyFn)( float *srcA, size_t vecSize );
630 
test_oneToFloat_kernel(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,oneToFloatVerifyFn verifyFn,float ulpLimit,MTdata d)631 int test_oneToFloat_kernel(cl_command_queue queue, cl_context context, const char *fnName,
632                            size_t vecSize, oneToFloatVerifyFn verifyFn, float ulpLimit, MTdata d )
633 {
634     clProgramWrapper program;
635     clKernelWrapper kernel;
636     clMemWrapper streams[2];
637     BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE * 4));
638     BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE));
639     int error;
640     size_t i, threads[1], localThreads[1];
641     char kernelSource[10240];
642     char *programPtr;
643     char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
644     cl_float *inDataA = A;
645     cl_float *outData = B;
646 
647     /* Create the source */
648     sprintf( kernelSource, vecSize == 3? oneToFloatKernelPatternV3 : oneToFloatKernelPattern, sizeNames[vecSize-1], fnName );
649 
650     /* Create kernels */
651     programPtr = kernelSource;
652     if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
653     {
654         return -1;
655     }
656 
657     /* Generate some streams */
658     for( i = 0; i < TEST_SIZE * vecSize; i++ )
659     {
660         inDataA[ i ] = get_random_float( -512.f, 512.f, d );
661     }
662     fillWithTrickyNumbers( inDataA, NULL, vecSize );
663 
664     /* Clamp values to be in range for fast_ functions */
665     if( verifyFn == verifyFastLength )
666     {
667         for( i = 0; i < TEST_SIZE * vecSize; i++ )
668         {
669             if( fabsf( inDataA[i] ) > MAKE_HEX_FLOAT(0x1.0p62f, 0x1L, 62) || fabsf( inDataA[i] ) < MAKE_HEX_FLOAT(0x1.0p-62f, 0x1L, -62) )
670                 inDataA[ i ] = get_random_float( -512.f, 512.f, d );
671         }
672     }
673 
674     streams[0] =
675         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
676                        sizeof(cl_float) * vecSize * TEST_SIZE, inDataA, NULL);
677     if( streams[0] == NULL )
678     {
679         log_error("ERROR: Creating input array A failed!\n");
680         return -1;
681     }
682     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
683                                 sizeof(cl_float) * TEST_SIZE, NULL, NULL);
684     if( streams[1] == NULL )
685     {
686         log_error("ERROR: Creating output array failed!\n");
687         return -1;
688     }
689 
690     /* Assign streams and execute */
691     error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[0] );
692     test_error( error, "Unable to set indexed kernel arguments" );
693     error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[1] );
694     test_error( error, "Unable to set indexed kernel arguments" );
695 
696     /* Run the kernel */
697     threads[0] = TEST_SIZE;
698 
699     error = get_max_common_work_group_size( context, kernel, threads[0],
700                                            &localThreads[0] );
701     test_error( error, "Unable to get work group size to use" );
702 
703     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads,
704                                    localThreads, 0, NULL, NULL );
705     test_error( error, "Unable to execute test kernel" );
706 
707     /* Now get the results */
708     error = clEnqueueReadBuffer( queue, streams[1], true, 0,
709                                 sizeof( cl_float ) * TEST_SIZE, outData,
710                                 0, NULL, NULL );
711     test_error( error, "Unable to read output array!" );
712 
713     /* And verify! */
714     for( i = 0; i < TEST_SIZE; i++ )
715     {
716         double expected = verifyFn( inDataA + i * vecSize, vecSize );
717         if( (float) expected != outData[ i ] )
718         {
719             float ulps = Ulp_Error( outData[i], expected );
720             if( fabsf( ulps ) <= ulpLimit )
721                 continue;
722 
723             // We have to special case NAN
724             if( isnan( outData[ i ] ) && isnan( expected ) )
725                 continue;
726 
727             if(! (fabsf(ulps) < ulpLimit) )
728             {
729                 log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), source (%a), ulp %f\n",
730                           (int)i, (int)vecSize, expected, outData[ i ],  inDataA[i*vecSize], ulps );
731                 char vecA[1000];
732                 vector2string( vecA, inDataA + i *vecSize, vecSize );
733                 log_error( "\tvector: %s", vecA );
734                 return -1;
735             }
736         }
737     }
738 
739     return 0;
740 }
741 
verifyLength(float * srcA,size_t vecSize)742 double verifyLength( float *srcA, size_t vecSize )
743 {
744     double total = 0;
745     unsigned int i;
746 
747     // We calculate the distance as a double, to try and make up for the fact that
748     // the GPU has better precision distance since it's a single op
749     for( i = 0; i < vecSize; i++ )
750     {
751         total += (double)srcA[i] * (double)srcA[i];
752     }
753 
754     return sqrt( total );
755 }
756 
test_geom_length(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)757 int test_geom_length(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
758 {
759     size_t sizes[] = { 1, 2, 3, 4, 0 };
760     unsigned int size;
761     int retVal = 0;
762     RandomSeed seed( gRandomSeed );
763 
764     for( size = 0; sizes[ size ] != 0 ; size++ )
765     {
766         float maxUlps = 3.0f +                              // error in sqrt
767         0.5f *                              // effect on e of taking sqrt( x + e )
768         ( 0.5f * (float) sizes[size] +      // cumulative error for multiplications
769          0.5f * (float) (sizes[size]-1));    // cumulative error for additions
770 
771         if( test_oneToFloat_kernel( queue, context, "length", sizes[ size ], verifyLength, maxUlps, seed ) != 0 )
772         {
773             log_error( "   length vector size %d FAILED\n", (int)sizes[ size ] );
774             retVal = -1;
775         }
776         else
777         {
778             log_info( "   length vector vector size %d passed\n", (int)sizes[ size ] );
779         }
780     }
781     if (retVal)
782         return retVal;
783 
784     if(!is_extension_available(deviceID, "cl_khr_fp64"))
785     {
786         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
787         return 0;
788     }
789     else
790     {
791         log_info("Testing doubles...\n");
792         return test_geom_length_double( deviceID,  context,  queue,  num_elements, seed);
793     }
794 }
795 
796 
verifyFastLength(float * srcA,size_t vecSize)797 double verifyFastLength( float *srcA, size_t vecSize )
798 {
799     double total = 0;
800     unsigned int i;
801 
802     // We calculate the distance as a double, to try and make up for the fact that
803     // the GPU has better precision distance since it's a single op
804     for( i = 0; i < vecSize; i++ )
805     {
806         total += (double)srcA[i] * (double)srcA[i];
807     }
808 
809     return sqrt( total );
810 }
811 
test_geom_fast_length(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)812 int test_geom_fast_length(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
813 {
814     size_t sizes[] = { 1, 2, 3, 4, 0 };
815     unsigned int size;
816     int retVal = 0;
817     RandomSeed seed(gRandomSeed);
818 
819     for( size = 0; sizes[ size ] != 0 ; size++ )
820     {
821         float maxUlps = 8192.0f +                           // error in half_sqrt
822         ( 0.5f * (float) sizes[size] +      // cumulative error for multiplications
823          0.5f * (float) (sizes[size]-1));    // cumulative error for additions
824 
825         if( test_oneToFloat_kernel( queue, context, "fast_length", sizes[ size ], verifyFastLength, maxUlps, seed ) != 0 )
826         {
827             log_error( "   fast_length vector size %d FAILED\n", (int)sizes[ size ] );
828             retVal = -1;
829         }
830         else
831         {
832             log_info( "   fast_length vector size %d passed\n", (int)sizes[ size ] );
833         }
834     }
835     return retVal;
836 }
837 
838 
839 typedef void (*oneToOneVerifyFn)( float *srcA, float *dstA, size_t vecSize );
840 
841 
test_oneToOne_kernel(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,oneToOneVerifyFn verifyFn,float ulpLimit,int softball,MTdata d)842 int test_oneToOne_kernel(cl_command_queue queue, cl_context context, const char *fnName,
843                          size_t vecSize, oneToOneVerifyFn verifyFn, float ulpLimit, int softball, MTdata d )
844 {
845     clProgramWrapper program;
846     clKernelWrapper kernel;
847     clMemWrapper streams[2];
848     BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE
849                                        * vecSize));
850     BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE
851                                        * vecSize));
852     int error;
853     size_t i, j, threads[1], localThreads[1];
854     char kernelSource[10240];
855     char *programPtr;
856     char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
857     cl_float *inDataA = A;
858     cl_float *outData = B;
859     float ulp_error = 0;
860 
861     /* Create the source */
862     sprintf( kernelSource, vecSize == 3 ? oneToOneKernelPatternV3: oneToOneKernelPattern, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
863 
864     /* Create kernels */
865     programPtr = kernelSource;
866     if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr,  "sample_test" ) )
867         return -1;
868 
869     /* Initialize data.  First element always 0. */
870     memset( inDataA, 0, sizeof(cl_float) * vecSize );
871     if( 0 == strcmp( fnName, "fast_normalize" ))
872     { // keep problematic cases out of the fast function
873         for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
874         {
875             cl_float z = get_random_float( -MAKE_HEX_FLOAT( 0x1.0p60f, 1, 60), MAKE_HEX_FLOAT( 0x1.0p60f, 1, 60), d);
876             if( fabsf(z) < MAKE_HEX_FLOAT( 0x1.0p-60f, 1, -60) )
877                 z = copysignf( 0.0f, z );
878             inDataA[i] = z;
879         }
880     }
881     else
882     {
883         for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
884             inDataA[i] = any_float(d);
885     }
886 
887     streams[0] =
888         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
889                        sizeof(cl_float) * vecSize * TEST_SIZE, inDataA, NULL);
890     if( streams[0] == NULL )
891     {
892         log_error("ERROR: Creating input array A failed!\n");
893         return -1;
894     }
895     streams[1] =
896         clCreateBuffer(context, CL_MEM_READ_WRITE,
897                        sizeof(cl_float) * vecSize * TEST_SIZE, NULL, NULL);
898     if( streams[1] == NULL )
899     {
900         log_error("ERROR: Creating output array failed!\n");
901         return -1;
902     }
903 
904     /* Assign streams and execute */
905     error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0] );
906     test_error( error, "Unable to set indexed kernel arguments" );
907     error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1] );
908     test_error( error, "Unable to set indexed kernel arguments" );
909 
910     /* Run the kernel */
911     threads[0] = TEST_SIZE;
912 
913     error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
914     test_error( error, "Unable to get work group size to use" );
915 
916     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
917     test_error( error, "Unable to execute test kernel" );
918 
919     /* Now get the results */
920     error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_float ) * TEST_SIZE  * vecSize, outData, 0, NULL, NULL );
921     test_error( error, "Unable to read output array!" );
922 
923     /* And verify! */
924     for( i = 0; i < TEST_SIZE; i++ )
925     {
926         float expected[4];
927         int fail = 0;
928         verifyFn( inDataA + i * vecSize, expected, vecSize );
929         for( j = 0; j < vecSize; j++ )
930         {
931             // We have to special case NAN
932             if( isnan( outData[ i * vecSize + j ] )
933                && isnan( expected[ j ] ) )
934                 continue;
935 
936             if( expected[j] != outData[ i * vecSize + j ] ) {
937                 ulp_error = Ulp_Error(  outData[i*vecSize+j], expected[ j ] );
938 
939                 if( fabsf(ulp_error) > ulpLimit ) {
940                     fail = 1;
941                     break;
942                 }
943             }
944 
945         }
946 
947         // try again with subnormals flushed to zero if the platform flushes
948         if( fail && gFlushDenormsToZero )
949         {
950             float temp[4], expected2[4];
951             for( j = 0; j < vecSize; j++ )
952             {
953                 if( IsFloatSubnormal(inDataA[i*vecSize+j] ) )
954                     temp[j] = copysignf( 0.0f, inDataA[i*vecSize+j] );
955                 else
956                     temp[j] = inDataA[ i*vecSize +j];
957             }
958 
959             verifyFn( temp, expected2, vecSize );
960             fail = 0;
961 
962             for( j = 0; j < vecSize; j++ )
963             {
964                 // We have to special case NAN
965                 if( isnan( outData[ i * vecSize + j ] ) && isnan( expected[ j ] ) )
966                     continue;
967 
968                 if( expected2[j] != outData[ i * vecSize + j ] )
969                 {
970                     ulp_error = Ulp_Error(outData[i*vecSize + j ], expected[ j ]  );
971 
972                     if( fabsf(ulp_error) > ulpLimit )
973                     {
974                         if( IsFloatSubnormal(expected2[j]) )
975                         {
976                             expected2[j] = 0.0f;
977                             if( expected2[j] !=  outData[i*vecSize + j ] )
978                             {
979                                 ulp_error = Ulp_Error(  outData[ i * vecSize + j ], expected[ j ] );
980                                 if( fabsf(ulp_error) > ulpLimit ) {
981                                     fail = 1;
982                                     break;
983                                 }
984                             }
985                         }
986                     }
987                 }
988             }
989         }
990 
991         if( fail )
992         {
993             log_error( "ERROR: Data sample {%d,%d} at size %d does not validate! Expected %12.24f (%a), got %12.24f (%a), ulp %f\n",
994                       (int)i, (int)j, (int)vecSize, expected[j], expected[j], outData[ i*vecSize+j], outData[ i*vecSize+j], ulp_error );
995             log_error( "       Source: " );
996             for( size_t q = 0; q < vecSize; q++ )
997                 log_error( "%g ", inDataA[ i * vecSize+q]);
998             log_error( "\n             : " );
999             for( size_t q = 0; q < vecSize; q++ )
1000                 log_error( "%a ", inDataA[i*vecSize +q] );
1001             log_error( "\n" );
1002             log_error( "       Result: " );
1003             for( size_t q = 0; q < vecSize; q++ )
1004                 log_error( "%g ", outData[ i *vecSize + q ] );
1005             log_error( "\n             : " );
1006             for( size_t q = 0; q < vecSize; q++ )
1007                 log_error( "%a ", outData[ i * vecSize + q ] );
1008             log_error( "\n" );
1009             log_error( "       Expected: " );
1010             for( size_t q = 0; q < vecSize; q++ )
1011                 log_error( "%g ", expected[ q ] );
1012             log_error( "\n             : " );
1013             for( size_t q = 0; q < vecSize; q++ )
1014                 log_error( "%a ", expected[ q ] );
1015             log_error( "\n" );
1016             return -1;
1017         }
1018     }
1019 
1020     return 0;
1021 }
1022 
verifyNormalize(float * srcA,float * dst,size_t vecSize)1023 void verifyNormalize( float *srcA, float *dst, size_t vecSize )
1024 {
1025     double total = 0, value;
1026     unsigned int i;
1027 
1028     // We calculate everything as a double, to try and make up for the fact that
1029     // the GPU has better precision distance since it's a single op
1030     for( i = 0; i < vecSize; i++ )
1031         total += (double)srcA[i] * (double)srcA[i];
1032 
1033     if( total == 0.f )
1034     {
1035         // Special edge case: copy vector over without change
1036         for( i = 0; i < vecSize; i++ )
1037             dst[i] = srcA[i];
1038         return;
1039     }
1040 
1041     // Deal with infinities
1042     if( total == INFINITY )
1043     {
1044         total = 0.0f;
1045         for( i = 0; i < vecSize; i++ )
1046         {
1047             if( fabsf( srcA[i]) == INFINITY )
1048                 dst[i] = copysignf( 1.0f, srcA[i] );
1049             else
1050                 dst[i] = copysignf( 0.0f, srcA[i] );
1051             total += (double)dst[i] * (double)dst[i];
1052         }
1053 
1054         srcA = dst;
1055     }
1056 
1057     value = sqrt( total );
1058     for( i = 0; i < vecSize; i++ )
1059         dst[i] = (float)( (double)srcA[i] / value );
1060 }
1061 
test_geom_normalize(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1062 int test_geom_normalize(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
1063 {
1064     size_t sizes[] = { 1, 2, 3, 4, 0 };
1065     unsigned int size;
1066     int retVal = 0;
1067     RandomSeed seed(gRandomSeed);
1068 
1069     for( size = 0; sizes[ size ] != 0 ; size++ )
1070     {
1071         float maxUlps = 2.5f +                              // error in rsqrt + error in multiply
1072         ( 0.5f * (float) sizes[size] +      // cumulative error for multiplications
1073          0.5f * (float) (sizes[size]-1));    // cumulative error for additions
1074         if( test_oneToOne_kernel( queue, context, "normalize", sizes[ size ], verifyNormalize, maxUlps, 0, seed ) != 0 )
1075         {
1076             log_error( "   normalized vector size %d FAILED\n", (int)sizes[ size ] );
1077             retVal = -1;
1078         }
1079         else
1080         {
1081             log_info( "   normalized vector size %d passed\n", (int)sizes[ size ] );
1082         }
1083     }
1084     if (retVal)
1085         return retVal;
1086 
1087     if(!is_extension_available(deviceID, "cl_khr_fp64"))
1088     {
1089         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
1090         return 0;
1091     } else {
1092         log_info("Testing doubles...\n");
1093         return test_geom_normalize_double( deviceID,  context,  queue,  num_elements, seed);
1094     }
1095 }
1096 
1097 
test_geom_fast_normalize(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1098 int test_geom_fast_normalize(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
1099 {
1100     size_t sizes[] = { 1, 2, 3, 4, 0 };
1101     unsigned int size;
1102     int retVal = 0;
1103     RandomSeed seed( gRandomSeed );
1104 
1105     for( size = 0; sizes[ size ] != 0 ; size++ )
1106     {
1107         float maxUlps = 8192.5f +                           // error in rsqrt + error in multiply
1108         ( 0.5f * (float) sizes[size] +      // cumulative error for multiplications
1109          0.5f * (float) (sizes[size]-1));    // cumulative error for additions
1110 
1111         if( test_oneToOne_kernel( queue, context, "fast_normalize", sizes[ size ], verifyNormalize, maxUlps, 1, seed ) != 0 )
1112         {
1113             log_error( "   fast_normalize vector size %d FAILED\n", (int)sizes[ size ] );
1114             retVal = -1;
1115         }
1116         else
1117         {
1118             log_info( "   fast_normalize vector size %d passed\n", (int)sizes[ size ] );
1119         }
1120     }
1121     return retVal;
1122 }
1123 
1124 
1125 
1126