1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/testHarness.h"
18 
19 #include <string.h>
20 #include "cl_utils.h"
21 #include "tests.h"
22 
23 #include <CL/cl_half.h>
24 
Test_vLoadHalf_private(cl_device_id device,bool aligned)25 int Test_vLoadHalf_private( cl_device_id device, bool aligned )
26 {
27     cl_int error;
28     int vectorSize;
29     cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][AS_NumAddressSpaces] = {{0}};
30     cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][AS_NumAddressSpaces] = {{0}};
31     uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
32     uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
33     size_t q;
34 
35     memset( min_time, -1, sizeof( min_time ) );
36 
37     const char *vector_size_names[]   = {"1", "2", "4", "8", "16", "3"};
38 
39     int minVectorSize = kMinVectorSize;
40     // There is no aligned scalar vloada_half in CL 1.1
41 #if ! defined( CL_VERSION_1_1 ) && ! defined(__APPLE__)
42     vlog("Note: testing vloada_half.\n");
43     if (aligned && minVectorSize == 0)
44         minVectorSize = 1;
45 #endif
46 
47     for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
48     {
49 
50         int effectiveVectorSize = g_arrVecSizes[vectorSize];
51         if(effectiveVectorSize == 3 && aligned) {
52             effectiveVectorSize = 4;
53         }
54         const char *source[] = {
55             "__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
56             "{\n"
57             "   size_t i = get_global_id(0);\n"
58             "   f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
59             "}\n"
60         };
61 
62         const char *sourceV3[] = {
63             "__kernel void test( const __global half *p, __global float *f,\n"
64             "                   uint extra_last_thread)\n"
65             "{\n"
66             "   size_t i = get_global_id(0);\n"
67             "   size_t last_i = get_global_size(0)-1;\n"
68             "   if(last_i == i && extra_last_thread != 0) {\n"
69             "     if(extra_last_thread ==2) {\n"
70             "       f[3*i+1] = vload_half(3*i+1, p);\n"
71             "     }\n"
72             "     f[3*i] = vload_half(3*i, p);\n"
73             "   } else {\n"
74             "     vstore3(vload_half3( i, p ),i,f);\n"
75             "   }\n"
76             "}\n"
77         };
78 
79         const char *sourceV3aligned[] = {
80             "__kernel void test( const __global half *p, __global float3 *f )\n"
81             "{\n"
82             "   size_t i = get_global_id(0);\n"
83             "   f[i] = vloada_half3( i, p );\n"
84             "   ((__global float *)f)[4*i+3] = vloada_half(4*i+3,p);\n"
85             "}\n"
86         };
87 
88         const char *source_private1[] = {
89             "__kernel void test( const __global half *p, __global float *f )\n"
90             "{\n"
91             "   __private ushort data[1];\n"
92             "   __private half* hdata_p = (__private half*) data;\n"
93             "   size_t i = get_global_id(0);\n"
94             "   data[0] = ((__global ushort*)p)[i];\n"
95             "   f[i] = vload", (aligned ? "a" : ""), "_half( 0, hdata_p );\n"
96             "}\n"
97         };
98 
99         const char *source_private2[] = {
100             "__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
101             "{\n"
102             "   __private ", align_types[vectorSize], " data[", vector_size_names[vectorSize], "/", align_divisors[vectorSize], "];\n"
103             "   __private half* hdata_p = (__private half*) data;\n"
104             "   __global  ", align_types[vectorSize], "* i_p = (__global ", align_types[vectorSize], "*)p;\n"
105             "   size_t i = get_global_id(0);\n"
106             "   int k;\n"
107             "   for (k=0; k<",vector_size_names[vectorSize],"/",align_divisors[vectorSize],"; k++)\n"
108             "     data[k] = i_p[i+k];\n"
109             "   f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( 0, hdata_p );\n"
110             "}\n"
111         };
112 
113         const char *source_privateV3[] = {
114             "__kernel void test( const __global half *p, __global float *f,"
115             "                    uint extra_last_thread )\n"
116             "{\n"
117             "   __private ushort data[3];\n"
118             "   __private half* hdata_p = (__private half*) data;\n"
119             "   __global  ushort* i_p = (__global  ushort*)p;\n"
120             "   size_t i = get_global_id(0);\n"
121             "   int k;\n"
122             //        "   data = vload3(i, i_p);\n"
123             "   size_t last_i = get_global_size(0)-1;\n"
124             "   if(last_i == i && extra_last_thread != 0) {\n"
125             "     if(extra_last_thread ==2) {\n"
126             "       f[3*i+1] = vload_half(3*i+1, p);\n"
127             "     }\n"
128             "     f[3*i] = vload_half(3*i, p);\n"
129             "   } else {\n"
130             "     for (k=0; k<3; k++)\n"
131             "       data[k] = i_p[i*3+k];\n"
132             "     vstore3(vload_half3( 0, hdata_p ), i, f);\n"
133             "   }\n"
134             "}\n"
135         };
136 
137         const char *source_privateV3aligned[] = {
138             "__kernel void test( const __global half *p, __global float3 *f )\n"
139             "{\n"
140             "   ushort4 data[4];\n"  // declare as vector for alignment. Make four to check to see vloada_half3 index is working.
141             "   half* hdata_p = (half*) &data;\n"
142             "   size_t i = get_global_id(0);\n"
143             "   global  ushort* i_p = (global  ushort*)p + i * 4;\n"
144             "   int offset = i & 3;\n"
145             "   data[offset] = (ushort4)( i_p[0], i_p[1], i_p[2], USHRT_MAX ); \n"
146             "   data[offset^1] = USHRT_MAX; \n"
147             "   data[offset^2] = USHRT_MAX; \n"
148             "   data[offset^3] = USHRT_MAX; \n"
149             //  test vloada_half3
150             "   f[i] = vloada_half3( offset, hdata_p );\n"
151             //  Fill in the 4th value so we don't have to special case this code elsewhere in the test.
152             "   mem_fence(CLK_GLOBAL_MEM_FENCE );\n"
153             "   ((__global float *)f)[4*i+3] = vload_half(4*i+3, p);\n"
154             "}\n"
155         };
156 
157         char local_buf_size[10];
158 
159         sprintf(local_buf_size, "%lld", (uint64_t)((effectiveVectorSize))*gWorkGroupSize);
160         const char *source_local1[] = {
161             "__kernel void test( const __global half *p, __global float *f )\n"
162             "{\n"
163             "   __local ushort data[",local_buf_size,"];\n"
164             "   __local half* hdata_p = (__local half*) data;\n"
165             "   size_t i = get_global_id(0);\n"
166             "   size_t lid = get_local_id(0);\n"
167             "   data[lid] = ((__global ushort*)p)[i];\n"
168             "   f[i] = vload", aligned ? "a" : "", "_half( lid, hdata_p );\n"
169             "}\n"
170         };
171 
172         const char *source_local2[] = {
173             "#define VECTOR_LEN (",
174             vector_size_names[vectorSize],
175             "/",
176             align_divisors[vectorSize],
177             ")\n"
178             "#define ALIGN_TYPE ",
179             align_types[vectorSize],
180             "\n"
181             "__kernel void test( const __global half *p, __global float",
182             vector_size_name_extensions[vectorSize],
183             " *f )\n"
184             "{\n"
185             "   __local uchar data[",
186             local_buf_size,
187             "/",
188             align_divisors[vectorSize],
189             "*sizeof(ALIGN_TYPE)] ",
190             "__attribute__((aligned(sizeof(ALIGN_TYPE))));\n"
191             "   __local half* hdata_p = (__local half*) data;\n"
192             "   __global ALIGN_TYPE* i_p = (__global ALIGN_TYPE*)p;\n"
193             "   size_t i = get_global_id(0);\n"
194             "   size_t lid = get_local_id(0);\n"
195             "   int k;\n"
196             "   for (k=0; k<VECTOR_LEN; k++)\n"
197             "     *(__local ",
198             "ALIGN_TYPE*)&(data[(lid*VECTOR_LEN+k)*sizeof(ALIGN_TYPE)]) = ",
199             "i_p[i*VECTOR_LEN+k];\n"
200             "   f[i] = vload",
201             aligned ? "a" : "",
202             "_half",
203             vector_size_name_extensions[vectorSize],
204             "( lid, hdata_p );\n"
205             "}\n"
206         };
207 
208         const char *source_localV3[] = {
209             "__kernel void test( const __global half *p, __global float *f,\n"
210             "                    uint extra_last_thread)\n"
211             "{\n"
212             "   __local ushort data[", local_buf_size,"];\n"
213             "   __local half* hdata_p = (__local half*) data;\n"
214             "   __global  ushort* i_p = (__global  ushort*)p;\n"
215             "   size_t i = get_global_id(0);\n"
216             "   size_t last_i = get_global_size(0)-1;\n"
217             "   size_t lid = get_local_id(0);\n"
218             "   int k;\n"
219             "   if(last_i == i && extra_last_thread != 0) {\n"
220             "     if(extra_last_thread ==2) {\n"
221             "       f[3*i+1] = vload_half(3*i+1, p);\n"
222             "     }\n"
223             "     f[3*i] = vload_half(3*i, p);\n"
224             "   } else {\n"
225             "     for (k=0; k<3; k++)\n"
226             "       data[lid*3+k] = i_p[i*3+k];\n"
227             "     vstore3( vload_half3( lid, hdata_p ),i,f);\n"
228             "   };\n"
229             "}\n"
230         };
231 
232         const char *source_localV3aligned[] = {
233             "__kernel void test( const __global half *p, __global float3 *f )\n"
234             "{\n"
235             "   __local ushort data[", local_buf_size,"];\n"
236             "   __local half* hdata_p = (__local half*) data;\n"
237             "   __global  ushort* i_p = (__global  ushort*)p;\n"
238             "   size_t i = get_global_id(0);\n"
239             "   size_t lid = get_local_id(0);\n"
240             "   int k;\n"
241             "   for (k=0; k<4; k++)\n"
242             "     data[lid*4+k] = i_p[i*4+k];\n"
243             "   f[i] = vloada_half3( lid, hdata_p );\n"
244             "   ((__global float *)f)[4*i+3] = vload_half(lid*4+3, hdata_p);\n"
245             "}\n"
246         };
247 
248         const char *source_constant[] = {
249             "__kernel void test( __constant half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
250             "{\n"
251             "   size_t i = get_global_id(0);\n"
252             "   f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
253             "}\n"
254         };
255 
256         const char *source_constantV3[] = {
257             "__kernel void test( __constant half *p, __global float *f,\n"
258             "                    uint extra_last_thread)\n"
259             "{\n"
260             "   size_t i = get_global_id(0);\n"
261             "   size_t last_i = get_global_size(0)-1;\n"
262             "   if(last_i == i && extra_last_thread != 0) {\n"
263             "     if(extra_last_thread ==2) {\n"
264             "       f[3*i+1] = vload_half(3*i+1, p);\n"
265             "     }\n"
266             "     f[3*i] = vload_half(3*i, p);\n"
267             "   } else {\n"
268             "     vstore3(vload_half",vector_size_name_extensions[vectorSize],"( i, p ), i, f);\n"
269             "   }\n"
270             "}\n"
271         };
272 
273         const char *source_constantV3aligned[] = {
274             "__kernel void test( __constant half *p, __global float3 *f )\n"
275             "{\n"
276             "   size_t i = get_global_id(0);\n"
277             "   f[i] = vloada_half3( i, p );\n"
278             "   ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n"
279             "}\n"
280         };
281 
282 
283         if(g_arrVecSizes[vectorSize] != 3) {
284             programs[vectorSize][AS_Global] = MakeProgram( device, source, sizeof( source) / sizeof( source[0])  );
285             if( NULL == programs[ vectorSize ][AS_Global] ) {
286                 gFailCount++;
287                 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
288                 for ( q= 0; q < sizeof( source) / sizeof( source[0]); q++)
289                     vlog_error("%s", source[q]);
290                 return -1;
291             } else {
292             }
293         } else if(aligned) {
294             programs[vectorSize][AS_Global] = MakeProgram( device, sourceV3aligned, sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0])  );
295             if( NULL == programs[ vectorSize ][AS_Global] ) {
296                 gFailCount++;
297                 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
298                 for ( q= 0; q < sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0]); q++)
299                     vlog_error("%s", sourceV3aligned[q]);
300                 return -1;
301             } else {
302             }
303         } else {
304             programs[vectorSize][AS_Global] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0])  );
305             if( NULL == programs[ vectorSize ][AS_Global] ) {
306                 gFailCount++;
307                 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
308                 for ( q= 0; q < sizeof( sourceV3) / sizeof( sourceV3[0]); q++)
309                     vlog_error("%s", sourceV3[q]);
310                 return -1;
311             }
312         }
313 
314         kernels[ vectorSize ][AS_Global] = clCreateKernel( programs[ vectorSize ][AS_Global], "test", &error );
315         if( NULL == kernels[vectorSize][AS_Global] )
316         {
317             gFailCount++;
318             vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
319             return -2;
320         }
321 
322         const char** source_ptr;
323         uint32_t source_size;
324         if (vectorSize == 0) {
325             source_ptr = source_private1;
326             source_size = sizeof( source_private1) / sizeof( source_private1[0]);
327         } else if(g_arrVecSizes[vectorSize] == 3) {
328             if(aligned) {
329                 source_ptr = source_privateV3aligned;
330                 source_size = sizeof( source_privateV3aligned) / sizeof( source_privateV3aligned[0]);
331             } else {
332                 source_ptr = source_privateV3;
333                 source_size = sizeof( source_privateV3) / sizeof( source_privateV3[0]);
334             }
335         } else {
336             source_ptr = source_private2;
337             source_size = sizeof( source_private2) / sizeof( source_private2[0]);
338         }
339         programs[vectorSize][AS_Private] = MakeProgram( device, source_ptr, source_size );
340         if( NULL == programs[ vectorSize ][AS_Private] )
341         {
342             gFailCount++;
343             vlog_error( "\t\tFAILED -- Failed to create private program.\n" );
344             for ( q= 0; q < source_size; q++)
345                 vlog_error("%s", source_ptr[q]);
346             return -1;
347         }
348 
349         kernels[ vectorSize ][AS_Private] = clCreateKernel( programs[ vectorSize ][AS_Private], "test", &error );
350         if( NULL == kernels[vectorSize][AS_Private] )
351         {
352             gFailCount++;
353             vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
354             return -2;
355         }
356 
357         if (vectorSize == 0) {
358             source_ptr = source_local1;
359             source_size = sizeof( source_local1) / sizeof( source_local1[0]);
360         } else if(g_arrVecSizes[vectorSize] == 3) {
361             if(aligned) {
362                 source_ptr = source_localV3aligned;
363                 source_size = sizeof(source_localV3aligned)/sizeof(source_localV3aligned[0]);
364             } else  {
365                 source_ptr = source_localV3;
366                 source_size = sizeof(source_localV3)/sizeof(source_localV3[0]);
367             }
368         } else {
369             source_ptr = source_local2;
370             source_size = sizeof( source_local2) / sizeof( source_local2[0]);
371         }
372         programs[vectorSize][AS_Local] = MakeProgram( device, source_ptr, source_size );
373         if( NULL == programs[ vectorSize ][AS_Local] )
374         {
375             gFailCount++;
376             vlog_error( "\t\tFAILED -- Failed to create local program.\n" );
377             for ( q= 0; q < source_size; q++)
378                 vlog_error("%s", source_ptr[q]);
379             return -1;
380         }
381 
382         kernels[ vectorSize ][AS_Local] = clCreateKernel( programs[ vectorSize ][AS_Local], "test", &error );
383         if( NULL == kernels[vectorSize][AS_Local] )
384         {
385             gFailCount++;
386             vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
387             return -2;
388         }
389 
390         if(g_arrVecSizes[vectorSize] == 3) {
391             if(aligned) {
392                 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constantV3aligned, sizeof(source_constantV3aligned) / sizeof( source_constantV3aligned[0])  );
393                 if( NULL == programs[ vectorSize ][AS_Constant] )
394                 {
395                     gFailCount++;
396                     vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
397                     for ( q= 0; q < sizeof( source_constantV3aligned) / sizeof( source_constantV3aligned[0]); q++)
398                         vlog_error("%s", source_constantV3aligned[q]);
399                     return -1;
400                 }
401             } else {
402                 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constantV3, sizeof(source_constantV3) / sizeof( source_constantV3[0])  );
403                 if( NULL == programs[ vectorSize ][AS_Constant] )
404                 {
405                     gFailCount++;
406                     vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
407                     for ( q= 0; q < sizeof( source_constantV3) / sizeof( source_constantV3[0]); q++)
408                         vlog_error("%s", source_constantV3[q]);
409                     return -1;
410                 }
411             }
412         } else {
413             programs[vectorSize][AS_Constant] = MakeProgram( device, source_constant, sizeof(source_constant) / sizeof( source_constant[0])  );
414             if( NULL == programs[ vectorSize ][AS_Constant] )
415             {
416                 gFailCount++;
417                 vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
418                 for ( q= 0; q < sizeof( source_constant) / sizeof( source_constant[0]); q++)
419                     vlog_error("%s", source_constant[q]);
420                 return -1;
421             }
422         }
423 
424         kernels[ vectorSize ][AS_Constant] = clCreateKernel( programs[ vectorSize ][AS_Constant], "test", &error );
425         if( NULL == kernels[vectorSize][AS_Constant] )
426         {
427             gFailCount++;
428             vlog_error( "\t\tFAILED -- Failed to create constant kernel. (%d)\n", error );
429             return -2;
430         }
431     }
432 
433     // Figure out how many elements are in a work block
434     size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
435     size_t blockCount = getBufferSize(device) / elementSize; // elementSize is power of 2
436     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of things of size cl_half
437 
438     // we handle 64-bit types a bit differently.
439     if( lastCase == 0 )
440         lastCase = 0x100000000ULL;
441 
442 
443     uint64_t i, j;
444     uint64_t printMask = (lastCase >> 4) - 1;
445     uint32_t count = 0;
446     error = 0;
447     int addressSpace;
448     //    int reported_vector_skip = 0;
449 
450     for( i = 0; i < (uint64_t)lastCase; i += blockCount )
451     {
452         count = (uint32_t) MIN( blockCount, lastCase - i );
453 
454         //Init the input stream
455         uint16_t *p = (uint16_t *)gIn_half;
456         for( j = 0; j < count; j++ )
457             p[j] = j + i;
458 
459         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)))
460         {
461             vlog_error( "Failure in clWriteArray\n" );
462             gFailCount++;
463             goto exit;
464         }
465 
466         //create the reference result
467         const unsigned short *s = (const unsigned short *)gIn_half;
468         float *d = (float *)gOut_single_reference;
469         for (j = 0; j < count; j++) d[j] = cl_half_to_float(s[j]);
470 
471         //Check the vector lengths
472         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
473         { // here we loop through vector sizes, 3 is last
474 
475             for ( addressSpace = 0; addressSpace < AS_NumAddressSpaces; addressSpace++) {
476                 uint32_t pattern = 0x7fffdead;
477 
478                 /*
479                  if (addressSpace == 3) {
480                  vlog("Note: skipping address space %s due to small buffer size.\n", addressSpaceNames[addressSpace]);
481                  continue;
482                  }
483                  */
484                 memset_pattern4( gOut_single, &pattern, getBufferSize(device));
485                 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
486                 {
487                     vlog_error( "Failure in clWriteArray\n" );
488                     gFailCount++;
489                     goto exit;
490                 }
491 
492                 if(g_arrVecSizes[vectorSize] == 3 && !aligned) {
493                     // now we need to add the extra const argument for how
494                     // many elements the last thread should take care of.
495                 }
496 
497                 // okay, here is where we have to be careful
498                 if( (error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
499                                        runsOverBy(count, vectorSize, aligned) ) ) )
500                 {
501                     gFailCount++;
502                     goto exit;
503                 }
504 
505                 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
506                 {
507                     vlog_error( "Failure in clReadArray\n" );
508                     gFailCount++;
509                     goto exit;
510                 }
511 
512                 if( memcmp( gOut_single, gOut_single_reference, count * sizeof( float )) )
513                 {
514                     uint32_t *u1 = (uint32_t *)gOut_single;
515                     uint32_t *u2 = (uint32_t *)gOut_single_reference;
516                     float *f1 = (float *)gOut_single;
517                     float *f2 = (float *)gOut_single_reference;
518                     for( j = 0; j < count; j++ )
519                     {
520                         if(isnan(f1[j]) && isnan(f2[j])) // both are nan dont compare them
521                             continue;
522                         if( u1[j] != u2[j])
523                         {
524                             vlog_error( " %lld)  (of %lld) Failure at 0x%4.4x:  %a vs *%a  (0x%8.8x vs *0x%8.8x)  vector_size = %d (%s) address space = %s, load is %s\n",
525                                        j, (uint64_t)count, ((unsigned short*)gIn_half)[j], f1[j], f2[j], u1[j], u2[j], (g_arrVecSizes[vectorSize]),
526                                        vector_size_names[vectorSize], addressSpaceNames[addressSpace],
527                                        (aligned?"aligned":"unaligned"));
528                             gFailCount++;
529                             error = -1;
530                             goto exit;
531                         }
532                     }
533                 }
534 
535                 if( gReportTimes && addressSpace == 0)
536                 {
537                     //Run again for timing
538                     for( j = 0; j < 100; j++ )
539                     {
540                         uint64_t startTime = ReadTime();
541                         error =
542                         RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
543                                   runsOverBy(count, vectorSize, aligned));
544                         if(error)
545                         {
546                             gFailCount++;
547                             goto exit;
548                         }
549 
550                         if( (error = clFinish(gQueue)) )
551                         {
552                             vlog_error( "Failure in clFinish\n" );
553                             gFailCount++;
554                             goto exit;
555                         }
556                         uint64_t currentTime = ReadTime() - startTime;
557                         time[ vectorSize ] += currentTime;
558                         if( currentTime < min_time[ vectorSize ] )
559                             min_time[ vectorSize ] = currentTime ;
560                     }
561                 }
562             }
563         }
564 
565         if( ((i+blockCount) & ~printMask) == (i+blockCount) )
566         {
567             vlog( "." );
568             fflush( stdout );
569         }
570     }
571 
572     vlog( "\n" );
573 
574     if( gReportTimes )
575     {
576         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
577             vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * 100), 0,
578                       "average us/elem", "vLoad%sHalf avg. (%s, vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
579         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
580             vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
581                       "best us/elem", "vLoad%sHalf best (%s vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
582     }
583 
584 exit:
585     //clean up
586     for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
587     {
588         for ( addressSpace = 0; addressSpace < AS_NumAddressSpaces; addressSpace++) {
589             clReleaseKernel( kernels[ vectorSize ][addressSpace] );
590             clReleaseProgram( programs[ vectorSize ][addressSpace] );
591         }
592     }
593 
594     return error;
595 }
596 
test_vload_half(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)597 int test_vload_half( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
598 {
599     return Test_vLoadHalf_private( device, false );
600 }
601 
test_vloada_half(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)602 int test_vloada_half( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
603 {
604     return Test_vLoadHalf_private( device, true );
605 }
606 
607