1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/testHarness.h"
18
19 #include <string.h>
20 #include "cl_utils.h"
21 #include "tests.h"
22
23 #include <CL/cl_half.h>
24
Test_vLoadHalf_private(cl_device_id device,bool aligned)25 int Test_vLoadHalf_private( cl_device_id device, bool aligned )
26 {
27 cl_int error;
28 int vectorSize;
29 cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][AS_NumAddressSpaces] = {{0}};
30 cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][AS_NumAddressSpaces] = {{0}};
31 uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
32 uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
33 size_t q;
34
35 memset( min_time, -1, sizeof( min_time ) );
36
37 const char *vector_size_names[] = {"1", "2", "4", "8", "16", "3"};
38
39 int minVectorSize = kMinVectorSize;
40 // There is no aligned scalar vloada_half in CL 1.1
41 #if ! defined( CL_VERSION_1_1 ) && ! defined(__APPLE__)
42 vlog("Note: testing vloada_half.\n");
43 if (aligned && minVectorSize == 0)
44 minVectorSize = 1;
45 #endif
46
47 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
48 {
49
50 int effectiveVectorSize = g_arrVecSizes[vectorSize];
51 if(effectiveVectorSize == 3 && aligned) {
52 effectiveVectorSize = 4;
53 }
54 const char *source[] = {
55 "__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
56 "{\n"
57 " size_t i = get_global_id(0);\n"
58 " f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
59 "}\n"
60 };
61
62 const char *sourceV3[] = {
63 "__kernel void test( const __global half *p, __global float *f,\n"
64 " uint extra_last_thread)\n"
65 "{\n"
66 " size_t i = get_global_id(0);\n"
67 " size_t last_i = get_global_size(0)-1;\n"
68 " if(last_i == i && extra_last_thread != 0) {\n"
69 " if(extra_last_thread ==2) {\n"
70 " f[3*i+1] = vload_half(3*i+1, p);\n"
71 " }\n"
72 " f[3*i] = vload_half(3*i, p);\n"
73 " } else {\n"
74 " vstore3(vload_half3( i, p ),i,f);\n"
75 " }\n"
76 "}\n"
77 };
78
79 const char *sourceV3aligned[] = {
80 "__kernel void test( const __global half *p, __global float3 *f )\n"
81 "{\n"
82 " size_t i = get_global_id(0);\n"
83 " f[i] = vloada_half3( i, p );\n"
84 " ((__global float *)f)[4*i+3] = vloada_half(4*i+3,p);\n"
85 "}\n"
86 };
87
88 const char *source_private1[] = {
89 "__kernel void test( const __global half *p, __global float *f )\n"
90 "{\n"
91 " __private ushort data[1];\n"
92 " __private half* hdata_p = (__private half*) data;\n"
93 " size_t i = get_global_id(0);\n"
94 " data[0] = ((__global ushort*)p)[i];\n"
95 " f[i] = vload", (aligned ? "a" : ""), "_half( 0, hdata_p );\n"
96 "}\n"
97 };
98
99 const char *source_private2[] = {
100 "__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
101 "{\n"
102 " __private ", align_types[vectorSize], " data[", vector_size_names[vectorSize], "/", align_divisors[vectorSize], "];\n"
103 " __private half* hdata_p = (__private half*) data;\n"
104 " __global ", align_types[vectorSize], "* i_p = (__global ", align_types[vectorSize], "*)p;\n"
105 " size_t i = get_global_id(0);\n"
106 " int k;\n"
107 " for (k=0; k<",vector_size_names[vectorSize],"/",align_divisors[vectorSize],"; k++)\n"
108 " data[k] = i_p[i+k];\n"
109 " f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( 0, hdata_p );\n"
110 "}\n"
111 };
112
113 const char *source_privateV3[] = {
114 "__kernel void test( const __global half *p, __global float *f,"
115 " uint extra_last_thread )\n"
116 "{\n"
117 " __private ushort data[3];\n"
118 " __private half* hdata_p = (__private half*) data;\n"
119 " __global ushort* i_p = (__global ushort*)p;\n"
120 " size_t i = get_global_id(0);\n"
121 " int k;\n"
122 // " data = vload3(i, i_p);\n"
123 " size_t last_i = get_global_size(0)-1;\n"
124 " if(last_i == i && extra_last_thread != 0) {\n"
125 " if(extra_last_thread ==2) {\n"
126 " f[3*i+1] = vload_half(3*i+1, p);\n"
127 " }\n"
128 " f[3*i] = vload_half(3*i, p);\n"
129 " } else {\n"
130 " for (k=0; k<3; k++)\n"
131 " data[k] = i_p[i*3+k];\n"
132 " vstore3(vload_half3( 0, hdata_p ), i, f);\n"
133 " }\n"
134 "}\n"
135 };
136
137 const char *source_privateV3aligned[] = {
138 "__kernel void test( const __global half *p, __global float3 *f )\n"
139 "{\n"
140 " ushort4 data[4];\n" // declare as vector for alignment. Make four to check to see vloada_half3 index is working.
141 " half* hdata_p = (half*) &data;\n"
142 " size_t i = get_global_id(0);\n"
143 " global ushort* i_p = (global ushort*)p + i * 4;\n"
144 " int offset = i & 3;\n"
145 " data[offset] = (ushort4)( i_p[0], i_p[1], i_p[2], USHRT_MAX ); \n"
146 " data[offset^1] = USHRT_MAX; \n"
147 " data[offset^2] = USHRT_MAX; \n"
148 " data[offset^3] = USHRT_MAX; \n"
149 // test vloada_half3
150 " f[i] = vloada_half3( offset, hdata_p );\n"
151 // Fill in the 4th value so we don't have to special case this code elsewhere in the test.
152 " mem_fence(CLK_GLOBAL_MEM_FENCE );\n"
153 " ((__global float *)f)[4*i+3] = vload_half(4*i+3, p);\n"
154 "}\n"
155 };
156
157 char local_buf_size[10];
158
159 sprintf(local_buf_size, "%lld", (uint64_t)((effectiveVectorSize))*gWorkGroupSize);
160 const char *source_local1[] = {
161 "__kernel void test( const __global half *p, __global float *f )\n"
162 "{\n"
163 " __local ushort data[",local_buf_size,"];\n"
164 " __local half* hdata_p = (__local half*) data;\n"
165 " size_t i = get_global_id(0);\n"
166 " size_t lid = get_local_id(0);\n"
167 " data[lid] = ((__global ushort*)p)[i];\n"
168 " f[i] = vload", aligned ? "a" : "", "_half( lid, hdata_p );\n"
169 "}\n"
170 };
171
172 const char *source_local2[] = {
173 "#define VECTOR_LEN (",
174 vector_size_names[vectorSize],
175 "/",
176 align_divisors[vectorSize],
177 ")\n"
178 "#define ALIGN_TYPE ",
179 align_types[vectorSize],
180 "\n"
181 "__kernel void test( const __global half *p, __global float",
182 vector_size_name_extensions[vectorSize],
183 " *f )\n"
184 "{\n"
185 " __local uchar data[",
186 local_buf_size,
187 "/",
188 align_divisors[vectorSize],
189 "*sizeof(ALIGN_TYPE)] ",
190 "__attribute__((aligned(sizeof(ALIGN_TYPE))));\n"
191 " __local half* hdata_p = (__local half*) data;\n"
192 " __global ALIGN_TYPE* i_p = (__global ALIGN_TYPE*)p;\n"
193 " size_t i = get_global_id(0);\n"
194 " size_t lid = get_local_id(0);\n"
195 " int k;\n"
196 " for (k=0; k<VECTOR_LEN; k++)\n"
197 " *(__local ",
198 "ALIGN_TYPE*)&(data[(lid*VECTOR_LEN+k)*sizeof(ALIGN_TYPE)]) = ",
199 "i_p[i*VECTOR_LEN+k];\n"
200 " f[i] = vload",
201 aligned ? "a" : "",
202 "_half",
203 vector_size_name_extensions[vectorSize],
204 "( lid, hdata_p );\n"
205 "}\n"
206 };
207
208 const char *source_localV3[] = {
209 "__kernel void test( const __global half *p, __global float *f,\n"
210 " uint extra_last_thread)\n"
211 "{\n"
212 " __local ushort data[", local_buf_size,"];\n"
213 " __local half* hdata_p = (__local half*) data;\n"
214 " __global ushort* i_p = (__global ushort*)p;\n"
215 " size_t i = get_global_id(0);\n"
216 " size_t last_i = get_global_size(0)-1;\n"
217 " size_t lid = get_local_id(0);\n"
218 " int k;\n"
219 " if(last_i == i && extra_last_thread != 0) {\n"
220 " if(extra_last_thread ==2) {\n"
221 " f[3*i+1] = vload_half(3*i+1, p);\n"
222 " }\n"
223 " f[3*i] = vload_half(3*i, p);\n"
224 " } else {\n"
225 " for (k=0; k<3; k++)\n"
226 " data[lid*3+k] = i_p[i*3+k];\n"
227 " vstore3( vload_half3( lid, hdata_p ),i,f);\n"
228 " };\n"
229 "}\n"
230 };
231
232 const char *source_localV3aligned[] = {
233 "__kernel void test( const __global half *p, __global float3 *f )\n"
234 "{\n"
235 " __local ushort data[", local_buf_size,"];\n"
236 " __local half* hdata_p = (__local half*) data;\n"
237 " __global ushort* i_p = (__global ushort*)p;\n"
238 " size_t i = get_global_id(0);\n"
239 " size_t lid = get_local_id(0);\n"
240 " int k;\n"
241 " for (k=0; k<4; k++)\n"
242 " data[lid*4+k] = i_p[i*4+k];\n"
243 " f[i] = vloada_half3( lid, hdata_p );\n"
244 " ((__global float *)f)[4*i+3] = vload_half(lid*4+3, hdata_p);\n"
245 "}\n"
246 };
247
248 const char *source_constant[] = {
249 "__kernel void test( __constant half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
250 "{\n"
251 " size_t i = get_global_id(0);\n"
252 " f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
253 "}\n"
254 };
255
256 const char *source_constantV3[] = {
257 "__kernel void test( __constant half *p, __global float *f,\n"
258 " uint extra_last_thread)\n"
259 "{\n"
260 " size_t i = get_global_id(0);\n"
261 " size_t last_i = get_global_size(0)-1;\n"
262 " if(last_i == i && extra_last_thread != 0) {\n"
263 " if(extra_last_thread ==2) {\n"
264 " f[3*i+1] = vload_half(3*i+1, p);\n"
265 " }\n"
266 " f[3*i] = vload_half(3*i, p);\n"
267 " } else {\n"
268 " vstore3(vload_half",vector_size_name_extensions[vectorSize],"( i, p ), i, f);\n"
269 " }\n"
270 "}\n"
271 };
272
273 const char *source_constantV3aligned[] = {
274 "__kernel void test( __constant half *p, __global float3 *f )\n"
275 "{\n"
276 " size_t i = get_global_id(0);\n"
277 " f[i] = vloada_half3( i, p );\n"
278 " ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n"
279 "}\n"
280 };
281
282
283 if(g_arrVecSizes[vectorSize] != 3) {
284 programs[vectorSize][AS_Global] = MakeProgram( device, source, sizeof( source) / sizeof( source[0]) );
285 if( NULL == programs[ vectorSize ][AS_Global] ) {
286 gFailCount++;
287 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
288 for ( q= 0; q < sizeof( source) / sizeof( source[0]); q++)
289 vlog_error("%s", source[q]);
290 return -1;
291 } else {
292 }
293 } else if(aligned) {
294 programs[vectorSize][AS_Global] = MakeProgram( device, sourceV3aligned, sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0]) );
295 if( NULL == programs[ vectorSize ][AS_Global] ) {
296 gFailCount++;
297 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
298 for ( q= 0; q < sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0]); q++)
299 vlog_error("%s", sourceV3aligned[q]);
300 return -1;
301 } else {
302 }
303 } else {
304 programs[vectorSize][AS_Global] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0]) );
305 if( NULL == programs[ vectorSize ][AS_Global] ) {
306 gFailCount++;
307 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
308 for ( q= 0; q < sizeof( sourceV3) / sizeof( sourceV3[0]); q++)
309 vlog_error("%s", sourceV3[q]);
310 return -1;
311 }
312 }
313
314 kernels[ vectorSize ][AS_Global] = clCreateKernel( programs[ vectorSize ][AS_Global], "test", &error );
315 if( NULL == kernels[vectorSize][AS_Global] )
316 {
317 gFailCount++;
318 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
319 return -2;
320 }
321
322 const char** source_ptr;
323 uint32_t source_size;
324 if (vectorSize == 0) {
325 source_ptr = source_private1;
326 source_size = sizeof( source_private1) / sizeof( source_private1[0]);
327 } else if(g_arrVecSizes[vectorSize] == 3) {
328 if(aligned) {
329 source_ptr = source_privateV3aligned;
330 source_size = sizeof( source_privateV3aligned) / sizeof( source_privateV3aligned[0]);
331 } else {
332 source_ptr = source_privateV3;
333 source_size = sizeof( source_privateV3) / sizeof( source_privateV3[0]);
334 }
335 } else {
336 source_ptr = source_private2;
337 source_size = sizeof( source_private2) / sizeof( source_private2[0]);
338 }
339 programs[vectorSize][AS_Private] = MakeProgram( device, source_ptr, source_size );
340 if( NULL == programs[ vectorSize ][AS_Private] )
341 {
342 gFailCount++;
343 vlog_error( "\t\tFAILED -- Failed to create private program.\n" );
344 for ( q= 0; q < source_size; q++)
345 vlog_error("%s", source_ptr[q]);
346 return -1;
347 }
348
349 kernels[ vectorSize ][AS_Private] = clCreateKernel( programs[ vectorSize ][AS_Private], "test", &error );
350 if( NULL == kernels[vectorSize][AS_Private] )
351 {
352 gFailCount++;
353 vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
354 return -2;
355 }
356
357 if (vectorSize == 0) {
358 source_ptr = source_local1;
359 source_size = sizeof( source_local1) / sizeof( source_local1[0]);
360 } else if(g_arrVecSizes[vectorSize] == 3) {
361 if(aligned) {
362 source_ptr = source_localV3aligned;
363 source_size = sizeof(source_localV3aligned)/sizeof(source_localV3aligned[0]);
364 } else {
365 source_ptr = source_localV3;
366 source_size = sizeof(source_localV3)/sizeof(source_localV3[0]);
367 }
368 } else {
369 source_ptr = source_local2;
370 source_size = sizeof( source_local2) / sizeof( source_local2[0]);
371 }
372 programs[vectorSize][AS_Local] = MakeProgram( device, source_ptr, source_size );
373 if( NULL == programs[ vectorSize ][AS_Local] )
374 {
375 gFailCount++;
376 vlog_error( "\t\tFAILED -- Failed to create local program.\n" );
377 for ( q= 0; q < source_size; q++)
378 vlog_error("%s", source_ptr[q]);
379 return -1;
380 }
381
382 kernels[ vectorSize ][AS_Local] = clCreateKernel( programs[ vectorSize ][AS_Local], "test", &error );
383 if( NULL == kernels[vectorSize][AS_Local] )
384 {
385 gFailCount++;
386 vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
387 return -2;
388 }
389
390 if(g_arrVecSizes[vectorSize] == 3) {
391 if(aligned) {
392 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constantV3aligned, sizeof(source_constantV3aligned) / sizeof( source_constantV3aligned[0]) );
393 if( NULL == programs[ vectorSize ][AS_Constant] )
394 {
395 gFailCount++;
396 vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
397 for ( q= 0; q < sizeof( source_constantV3aligned) / sizeof( source_constantV3aligned[0]); q++)
398 vlog_error("%s", source_constantV3aligned[q]);
399 return -1;
400 }
401 } else {
402 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constantV3, sizeof(source_constantV3) / sizeof( source_constantV3[0]) );
403 if( NULL == programs[ vectorSize ][AS_Constant] )
404 {
405 gFailCount++;
406 vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
407 for ( q= 0; q < sizeof( source_constantV3) / sizeof( source_constantV3[0]); q++)
408 vlog_error("%s", source_constantV3[q]);
409 return -1;
410 }
411 }
412 } else {
413 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constant, sizeof(source_constant) / sizeof( source_constant[0]) );
414 if( NULL == programs[ vectorSize ][AS_Constant] )
415 {
416 gFailCount++;
417 vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
418 for ( q= 0; q < sizeof( source_constant) / sizeof( source_constant[0]); q++)
419 vlog_error("%s", source_constant[q]);
420 return -1;
421 }
422 }
423
424 kernels[ vectorSize ][AS_Constant] = clCreateKernel( programs[ vectorSize ][AS_Constant], "test", &error );
425 if( NULL == kernels[vectorSize][AS_Constant] )
426 {
427 gFailCount++;
428 vlog_error( "\t\tFAILED -- Failed to create constant kernel. (%d)\n", error );
429 return -2;
430 }
431 }
432
433 // Figure out how many elements are in a work block
434 size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
435 size_t blockCount = getBufferSize(device) / elementSize; // elementSize is power of 2
436 uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of things of size cl_half
437
438 // we handle 64-bit types a bit differently.
439 if( lastCase == 0 )
440 lastCase = 0x100000000ULL;
441
442
443 uint64_t i, j;
444 uint64_t printMask = (lastCase >> 4) - 1;
445 uint32_t count = 0;
446 error = 0;
447 int addressSpace;
448 // int reported_vector_skip = 0;
449
450 for( i = 0; i < (uint64_t)lastCase; i += blockCount )
451 {
452 count = (uint32_t) MIN( blockCount, lastCase - i );
453
454 //Init the input stream
455 uint16_t *p = (uint16_t *)gIn_half;
456 for( j = 0; j < count; j++ )
457 p[j] = j + i;
458
459 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)))
460 {
461 vlog_error( "Failure in clWriteArray\n" );
462 gFailCount++;
463 goto exit;
464 }
465
466 //create the reference result
467 const unsigned short *s = (const unsigned short *)gIn_half;
468 float *d = (float *)gOut_single_reference;
469 for (j = 0; j < count; j++) d[j] = cl_half_to_float(s[j]);
470
471 //Check the vector lengths
472 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
473 { // here we loop through vector sizes, 3 is last
474
475 for ( addressSpace = 0; addressSpace < AS_NumAddressSpaces; addressSpace++) {
476 uint32_t pattern = 0x7fffdead;
477
478 /*
479 if (addressSpace == 3) {
480 vlog("Note: skipping address space %s due to small buffer size.\n", addressSpaceNames[addressSpace]);
481 continue;
482 }
483 */
484 memset_pattern4( gOut_single, &pattern, getBufferSize(device));
485 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
486 {
487 vlog_error( "Failure in clWriteArray\n" );
488 gFailCount++;
489 goto exit;
490 }
491
492 if(g_arrVecSizes[vectorSize] == 3 && !aligned) {
493 // now we need to add the extra const argument for how
494 // many elements the last thread should take care of.
495 }
496
497 // okay, here is where we have to be careful
498 if( (error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
499 runsOverBy(count, vectorSize, aligned) ) ) )
500 {
501 gFailCount++;
502 goto exit;
503 }
504
505 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
506 {
507 vlog_error( "Failure in clReadArray\n" );
508 gFailCount++;
509 goto exit;
510 }
511
512 if( memcmp( gOut_single, gOut_single_reference, count * sizeof( float )) )
513 {
514 uint32_t *u1 = (uint32_t *)gOut_single;
515 uint32_t *u2 = (uint32_t *)gOut_single_reference;
516 float *f1 = (float *)gOut_single;
517 float *f2 = (float *)gOut_single_reference;
518 for( j = 0; j < count; j++ )
519 {
520 if(isnan(f1[j]) && isnan(f2[j])) // both are nan dont compare them
521 continue;
522 if( u1[j] != u2[j])
523 {
524 vlog_error( " %lld) (of %lld) Failure at 0x%4.4x: %a vs *%a (0x%8.8x vs *0x%8.8x) vector_size = %d (%s) address space = %s, load is %s\n",
525 j, (uint64_t)count, ((unsigned short*)gIn_half)[j], f1[j], f2[j], u1[j], u2[j], (g_arrVecSizes[vectorSize]),
526 vector_size_names[vectorSize], addressSpaceNames[addressSpace],
527 (aligned?"aligned":"unaligned"));
528 gFailCount++;
529 error = -1;
530 goto exit;
531 }
532 }
533 }
534
535 if( gReportTimes && addressSpace == 0)
536 {
537 //Run again for timing
538 for( j = 0; j < 100; j++ )
539 {
540 uint64_t startTime = ReadTime();
541 error =
542 RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
543 runsOverBy(count, vectorSize, aligned));
544 if(error)
545 {
546 gFailCount++;
547 goto exit;
548 }
549
550 if( (error = clFinish(gQueue)) )
551 {
552 vlog_error( "Failure in clFinish\n" );
553 gFailCount++;
554 goto exit;
555 }
556 uint64_t currentTime = ReadTime() - startTime;
557 time[ vectorSize ] += currentTime;
558 if( currentTime < min_time[ vectorSize ] )
559 min_time[ vectorSize ] = currentTime ;
560 }
561 }
562 }
563 }
564
565 if( ((i+blockCount) & ~printMask) == (i+blockCount) )
566 {
567 vlog( "." );
568 fflush( stdout );
569 }
570 }
571
572 vlog( "\n" );
573
574 if( gReportTimes )
575 {
576 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
577 vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * 100), 0,
578 "average us/elem", "vLoad%sHalf avg. (%s, vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
579 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
580 vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
581 "best us/elem", "vLoad%sHalf best (%s vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
582 }
583
584 exit:
585 //clean up
586 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
587 {
588 for ( addressSpace = 0; addressSpace < AS_NumAddressSpaces; addressSpace++) {
589 clReleaseKernel( kernels[ vectorSize ][addressSpace] );
590 clReleaseProgram( programs[ vectorSize ][addressSpace] );
591 }
592 }
593
594 return error;
595 }
596
test_vload_half(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)597 int test_vload_half( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
598 {
599 return Test_vLoadHalf_private( device, false );
600 }
601
test_vloada_half(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)602 int test_vloada_half( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
603 {
604 return Test_vLoadHalf_private( device, true );
605 }
606
607