1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/kernelHelpers.h"
18 #include "harness/testHarness.h"
19 
20 #include <string.h>
21 #include "cl_utils.h"
22 #include "tests.h"
23 
24 #include <CL/cl_half.h>
25 
26 typedef struct ComputeReferenceInfoF_
27 {
28     float *x;
29     cl_ushort *r;
30     f2h f;
31     cl_ulong i;
32     cl_uint lim;
33     cl_uint count;
34 } ComputeReferenceInfoF;
35 
36 typedef struct ComputeReferenceInfoD_
37 {
38     double *x;
39     cl_ushort *r;
40     d2h f;
41     cl_ulong i;
42     cl_uint lim;
43     cl_uint count;
44 } ComputeReferenceInfoD;
45 
46 typedef struct CheckResultInfoF_
47 {
48     const float *x;
49     const cl_ushort *r;
50     const cl_ushort *s;
51     f2h f;
52     const char *aspace;
53     cl_uint lim;
54     cl_uint count;
55     int vsz;
56 } CheckResultInfoF;
57 
58 typedef struct CheckResultInfoD_
59 {
60     const double *x;
61     const cl_ushort *r;
62     const cl_ushort *s;
63     d2h f;
64     const char *aspace;
65     cl_uint lim;
66     cl_uint count;
67     int vsz;
68 } CheckResultInfoD;
69 
70 static cl_int
ReferenceF(cl_uint jid,cl_uint tid,void * userInfo)71 ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
72 {
73     ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
74     cl_uint lim = cri->lim;
75     cl_uint count = cri->count;
76     cl_uint off = jid * count;
77     float *x = cri->x + off;
78     cl_ushort *r = cri->r + off;
79     f2h f = cri->f;
80     cl_ulong i = cri->i + off;
81     cl_uint j, rr;
82 
83     if (off + count > lim)
84         count = lim - off;
85 
86     for (j = 0; j < count; ++j) {
87         x[j] = as_float((cl_uint)(i + j));
88         r[j] = f(x[j]);
89     }
90 
91     return 0;
92 }
93 
94 static cl_int
CheckF(cl_uint jid,cl_uint tid,void * userInfo)95 CheckF(cl_uint jid, cl_uint tid, void *userInfo)
96 {
97     CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
98     cl_uint lim = cri->lim;
99     cl_uint count = cri->count;
100     cl_uint off = jid * count;
101     const float *x = cri->x + off;
102     const cl_ushort *r = cri->r + off;
103     const cl_ushort *s = cri->s + off;
104     f2h f = cri->f;
105     cl_uint j;
106     cl_ushort correct2 = f( 0.0f);
107     cl_ushort correct3 = f(-0.0f);
108     cl_int ret = 0;
109 
110     if (off + count > lim)
111         count = lim - off;
112 
113     if (!memcmp(r, s, count*sizeof(cl_ushort)))
114         return 0;
115 
116     for (j = 0; j < count; j++) {
117     if (s[j] == r[j])
118         continue;
119 
120         // Pass any NaNs
121         if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 )
122             continue;
123 
124         // retry per section 6.5.3.3
125         if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
126             continue;
127 
128         // if reference result is subnormal, pass any zero
129         if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
130             continue;
131 
132         vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x,  vector_size = %d, address_space = %s\n",
133                    j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
134 
135         ret = 1;
136         break;
137     }
138 
139     return ret;
140 }
141 
142 static cl_int
ReferenceD(cl_uint jid,cl_uint tid,void * userInfo)143 ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
144 {
145     ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
146     cl_uint lim = cri->lim;
147     cl_uint count = cri->count;
148     cl_uint off = jid * count;
149     double *x = cri->x + off;
150     cl_ushort *r = cri->r + off;
151     d2h f = cri->f;
152     cl_uint j;
153     cl_ulong i = cri->i + off;
154 
155     if (off + count > lim)
156         count = lim - off;
157 
158     for (j = 0; j < count; ++j) {
159         x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
160         r[j] = f(x[j]);
161     }
162 
163     return 0;
164 }
165 
166 static cl_int
CheckD(cl_uint jid,cl_uint tid,void * userInfo)167 CheckD(cl_uint jid, cl_uint tid, void *userInfo)
168 {
169     CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
170     cl_uint lim = cri->lim;
171     cl_uint count = cri->count;
172     cl_uint off = jid * count;
173     const double *x = cri->x + off;
174     const cl_ushort *r = cri->r + off;
175     const cl_ushort *s = cri->s + off;
176     d2h f = cri->f;
177     cl_uint j;
178     cl_ushort correct2 = f( 0.0);
179     cl_ushort correct3 = f(-0.0);
180     cl_int ret = 0;
181 
182     if (off + count > lim)
183         count = lim - off;
184 
185     if (!memcmp(r, s, count*sizeof(cl_ushort)))
186         return 0;
187 
188     for (j = 0; j < count; j++) {
189     if (s[j] == r[j])
190         continue;
191 
192         // Pass any NaNs
193         if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00)
194             continue;
195 
196         if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
197             continue;
198 
199         // if reference result is subnormal, pass any zero result
200         if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
201             continue;
202 
203         vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, vector_size = %d, address space = %s (double precision)\n",
204                    j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
205 
206         ret = 1;
207     break;
208     }
209 
210     return ret;
211 }
212 
float2half_rte(float f)213 static cl_half float2half_rte(float f)
214 {
215     return cl_half_from_float(f, CL_HALF_RTE);
216 }
217 
float2half_rtz(float f)218 static cl_half float2half_rtz(float f)
219 {
220     return cl_half_from_float(f, CL_HALF_RTZ);
221 }
222 
float2half_rtp(float f)223 static cl_half float2half_rtp(float f)
224 {
225     return cl_half_from_float(f, CL_HALF_RTP);
226 }
227 
float2half_rtn(float f)228 static cl_half float2half_rtn(float f)
229 {
230     return cl_half_from_float(f, CL_HALF_RTN);
231 }
232 
double2half_rte(double f)233 static cl_half double2half_rte(double f)
234 {
235     return cl_half_from_double(f, CL_HALF_RTE);
236 }
237 
double2half_rtz(double f)238 static cl_half double2half_rtz(double f)
239 {
240     return cl_half_from_double(f, CL_HALF_RTZ);
241 }
242 
double2half_rtp(double f)243 static cl_half double2half_rtp(double f)
244 {
245     return cl_half_from_double(f, CL_HALF_RTP);
246 }
247 
double2half_rtn(double f)248 static cl_half double2half_rtn(double f)
249 {
250     return cl_half_from_double(f, CL_HALF_RTN);
251 }
252 
test_vstore_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)253 int test_vstore_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
254 {
255     switch (get_default_rounding_mode(deviceID))
256     {
257         case CL_FP_ROUND_TO_ZERO:
258             return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rte, "");
259         case 0:
260             return -1;
261         default:
262             return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "");
263     }
264 }
265 
test_vstore_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)266 int test_vstore_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
267 {
268     return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
269 }
270 
test_vstore_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)271 int test_vstore_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
272 {
273     return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
274 }
275 
test_vstore_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)276 int test_vstore_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
277 {
278     return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
279 }
280 
test_vstore_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)281 int test_vstore_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
282 {
283     return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
284 }
285 
test_vstorea_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)286 int test_vstorea_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
287 {
288     switch (get_default_rounding_mode(deviceID))
289     {
290         case CL_FP_ROUND_TO_ZERO:
291             return Test_vStoreaHalf_private(deviceID,float2half_rtz, double2half_rte, "");
292         case 0:
293             return -1;
294         default:
295             return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "");
296     }
297 }
298 
test_vstorea_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)299 int test_vstorea_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
300 {
301     return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
302 }
303 
test_vstorea_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)304 int test_vstorea_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
305 {
306     return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
307 }
308 
test_vstorea_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)309 int test_vstorea_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
310 {
311     return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
312 }
313 
test_vstorea_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)314 int test_vstorea_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
315 {
316     return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
317 }
318 
319 #pragma mark -
320 
Test_vStoreHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)321 int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
322 {
323     int vectorSize, error;
324     cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
325     cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
326 
327     uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
328     uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
329     memset( min_time, -1, sizeof( min_time ) );
330     cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
331     cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
332     uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
333     uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
334     memset( min_double_time, -1, sizeof( min_double_time ) );
335 
336     bool aligned= false;
337 
338     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
339     {
340         const char *source[] = {
341             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
342             "{\n"
343             "   size_t i = get_global_id(0);\n"
344             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
345             "}\n"
346         };
347 
348         const char *source_v3[] = {
349             "__kernel void test( __global float *p, __global half *f,\n"
350             "                   uint extra_last_thread)\n"
351             "{\n"
352             "   size_t i = get_global_id(0);\n"
353             "   size_t last_i = get_global_size(0)-1;\n"
354             "   size_t adjust = 0;\n"
355             "   if(last_i == i && extra_last_thread != 0) {\n"
356             "     adjust = 3-extra_last_thread;\n"
357             "   } "
358             "   vstore_half3",roundName,"( vload3(i, p-adjust), i, f-adjust );\n"
359             "}\n"
360         };
361 
362         const char *source_private_store[] = {
363             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
364             "{\n"
365             "   __private ushort data[16];\n"
366             "   size_t i = get_global_id(0);\n"
367             "   size_t offset = 0;\n"
368             "   size_t vecsize = vec_step(p[i]);\n"
369             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
370             "   for(offset = 0; offset < vecsize; offset++)\n"
371             "   {\n"
372             "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
373             "   }\n"
374             "}\n"
375         };
376 
377 
378         const char *source_private_store_v3[] = {
379             "__kernel void test( __global float *p, __global half *f,\n"
380             "                   uint extra_last_thread )\n"
381             "{\n"
382             "   __private ushort data[4];\n"
383             "   size_t i = get_global_id(0);\n"
384             "   size_t last_i = get_global_size(0)-1;\n"
385             "   size_t adjust = 0;\n"
386             "   size_t offset = 0;\n"
387             "   if(last_i == i && extra_last_thread != 0) {\n"
388             "     adjust = 3-extra_last_thread;\n"
389             "   } "
390             "   vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
391             "   for(offset = 0; offset < 3; offset++)\n"
392             "   {\n"
393             "       vstore_half(vload_half(offset, (__private half *) data), 0, &f[3*i+offset-adjust]);\n"
394             "   }\n"
395             "}\n"
396         };
397 
398         char local_buf_size[10];
399         sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
400 
401 
402         const char *source_local_store[] = {
403             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
404             "{\n"
405             "   __local ushort data[16*", local_buf_size, "];\n"
406             "   size_t i = get_global_id(0);\n"
407             "   size_t lid = get_local_id(0);\n"
408             "   size_t lsize = get_local_size(0);\n"
409             "   size_t vecsize = vec_step(p[0]);\n"
410             "   event_t async_event;\n"
411             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
412             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
413             "   async_event = async_work_group_copy((__global ushort *)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
414             "   wait_group_events(1, &async_event);\n"
415             "}\n"
416         };
417 
418         const char *source_local_store_v3[] = {
419             "__kernel void test( __global float *p, __global half *f,\n"
420             "                   uint extra_last_thread )\n"
421             "{\n"
422             "   __local ushort data[3*(", local_buf_size, "+1)];\n"
423             "   size_t i = get_global_id(0);\n"
424             "   size_t lid = get_local_id(0);\n"
425             "   size_t last_i = get_global_size(0)-1;\n"
426             "   size_t adjust = 0;\n"
427             "   size_t lsize = get_local_size(0);\n"
428             "   event_t async_event;\n"
429             "   if(last_i == i && extra_last_thread != 0) {\n"
430             "     adjust = 3-extra_last_thread;\n"
431             "   } "
432             "   vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
433             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
434             "   async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
435             "   wait_group_events(1, &async_event);\n"
436             "}\n"
437         };
438 
439         const char *double_source[] = {
440             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
441             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
442             "{\n"
443             "   size_t i = get_global_id(0);\n"
444             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
445             "}\n"
446         };
447 
448         const char *double_source_private_store[] = {
449             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
450             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
451             "{\n"
452             "   __private ushort data[16];\n"
453             "   size_t i = get_global_id(0);\n"
454             "   size_t offset = 0;\n"
455             "   size_t vecsize = vec_step(p[i]);\n"
456             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
457             "   for(offset = 0; offset < vecsize; offset++)\n"
458             "   {\n"
459             "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
460             "   }\n"
461             "}\n"
462         };
463 
464 
465         const char *double_source_local_store[] = {
466             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
467             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
468             "{\n"
469             "   __local ushort data[16*", local_buf_size, "];\n"
470             "   size_t i = get_global_id(0);\n"
471             "   size_t lid = get_local_id(0);\n"
472             "   size_t vecsize = vec_step(p[0]);\n"
473             "   size_t lsize = get_local_size(0);\n"
474             "   event_t async_event;\n"
475             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
476             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
477             "   async_event = async_work_group_copy((__global ushort *)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
478             "   wait_group_events(1, &async_event);\n"
479             "}\n"
480         };
481 
482 
483         const char *double_source_v3[] = {
484             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
485             "__kernel void test( __global double *p, __global half *f ,\n"
486             "                   uint extra_last_thread)\n"
487             "{\n"
488             "   size_t i = get_global_id(0);\n"
489             "   size_t last_i = get_global_size(0)-1;\n"
490             "   size_t adjust = 0;\n"
491             "   if(last_i == i && extra_last_thread != 0) {\n"
492             "     adjust = 3-extra_last_thread;\n"
493             "   } "
494             "   vstore_half3",roundName,"( vload3(i,p-adjust), i, f -adjust);\n"
495             "}\n"
496         };
497 
498         const char *double_source_private_store_v3[] = {
499             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
500             "__kernel void test( __global double *p, __global half *f,\n"
501             "                   uint extra_last_thread )\n"
502             "{\n"
503             "   __private ushort data[4];\n"
504             "   size_t i = get_global_id(0);\n"
505             "   size_t last_i = get_global_size(0)-1;\n"
506             "   size_t adjust = 0;\n"
507             "   size_t offset = 0;\n"
508             "   if(last_i == i && extra_last_thread != 0) {\n"
509             "     adjust = 3-extra_last_thread;\n"
510             "   } "
511             "   vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
512             "   for(offset = 0; offset < 3; offset++)\n"
513             "   {\n"
514             "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[3*i+offset-adjust]);\n"
515             "   }\n"
516             "}\n"
517         };
518 
519         const char *double_source_local_store_v3[] = {
520             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
521             "__kernel void test( __global double *p, __global half *f,\n"
522             "                   uint extra_last_thread )\n"
523             "{\n"
524             "   __local ushort data[3*(", local_buf_size, "+1)];\n"
525             "   size_t i = get_global_id(0);\n"
526             "   size_t lid = get_local_id(0);\n"
527             "   size_t last_i = get_global_size(0)-1;\n"
528             "   size_t adjust = 0;\n"
529             "   size_t lsize = get_local_size(0);\n"
530             "   event_t async_event;\n"
531             "   if(last_i == i && extra_last_thread != 0) {\n"
532             "     adjust = 3-extra_last_thread;\n"
533             "   }\n "
534             "   vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
535             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
536             "   async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
537             "   wait_group_events(1, &async_event);\n"
538             "}\n"
539         };
540 
541 
542 
543         if(g_arrVecSizes[vectorSize] == 3) {
544             programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
545         } else {
546             programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
547         }
548         if( NULL == programs[ vectorSize ][0] )
549         {
550             gFailCount++;
551             return -1;
552         }
553 
554         kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
555         if( NULL == kernels[vectorSize][0] )
556         {
557             gFailCount++;
558             vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
559             return error;
560         }
561 
562         if(g_arrVecSizes[vectorSize] == 3) {
563             programs[vectorSize][1] = MakeProgram( device, source_private_store_v3, sizeof(source_private_store_v3) / sizeof( source_private_store_v3[0]) );
564         } else {
565             programs[vectorSize][1] = MakeProgram( device, source_private_store, sizeof(source_private_store) / sizeof( source_private_store[0]) );
566         }
567         if( NULL == programs[ vectorSize ][1] )
568         {
569             gFailCount++;
570             return -1;
571         }
572 
573         kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
574         if( NULL == kernels[vectorSize][1] )
575         {
576             gFailCount++;
577             vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
578             return error;
579         }
580 
581         if(g_arrVecSizes[vectorSize] == 3) {
582             programs[vectorSize][2] = MakeProgram( device, source_local_store_v3, sizeof(source_local_store_v3) / sizeof( source_local_store_v3[0]) );
583             if(  NULL == programs[ vectorSize ][2] )
584             {
585                 unsigned q;
586                 for ( q= 0; q < sizeof( source_local_store_v3) / sizeof( source_local_store_v3[0]); q++)
587                     vlog_error("%s", source_local_store_v3[q]);
588 
589                 gFailCount++;
590                 return -1;
591 
592             }
593         } else {
594             programs[vectorSize][2] = MakeProgram( device, source_local_store, sizeof(source_local_store) / sizeof( source_local_store[0]) );
595             if( NULL == programs[ vectorSize ][2] )
596             {
597                 unsigned q;
598                 for ( q= 0; q < sizeof( source_local_store) / sizeof( source_local_store[0]); q++)
599                     vlog_error("%s", source_local_store[q]);
600 
601                 gFailCount++;
602                 return -1;
603 
604             }
605         }
606 
607         kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
608         if( NULL == kernels[vectorSize][2] )
609         {
610             gFailCount++;
611             vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
612             return error;
613         }
614 
615         if( gTestDouble )
616         {
617             if(g_arrVecSizes[vectorSize] == 3) {
618                 doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
619             } else {
620                 doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
621             }
622             if( NULL == doublePrograms[ vectorSize ][0] )
623             {
624                 gFailCount++;
625                 return -1;
626             }
627 
628             doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
629             if( NULL == kernels[vectorSize][0] )
630             {
631                 gFailCount++;
632                 vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
633                 return error;
634             }
635 
636             if(g_arrVecSizes[vectorSize] == 3)
637                 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store_v3, sizeof(double_source_private_store_v3) / sizeof( double_source_private_store_v3[0]) );
638             else
639                 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store, sizeof(double_source_private_store) / sizeof( double_source_private_store[0]) );
640 
641             if( NULL == doublePrograms[ vectorSize ][1] )
642             {
643                 gFailCount++;
644                 return -1;
645             }
646 
647             doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
648             if( NULL == kernels[vectorSize][1] )
649             {
650                 gFailCount++;
651                 vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
652                 return error;
653             }
654 
655             if(g_arrVecSizes[vectorSize] == 3) {
656                 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store_v3, sizeof(double_source_local_store_v3) / sizeof( double_source_local_store_v3[0]) );
657             } else {
658                 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store, sizeof(double_source_local_store) / sizeof( double_source_local_store[0]) );
659             }
660             if( NULL == doublePrograms[ vectorSize ][2] )
661             {
662                 gFailCount++;
663                 return -1;
664             }
665 
666             doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
667             if( NULL == kernels[vectorSize][2] )
668             {
669                 gFailCount++;
670                 vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
671                 return error;
672             }
673         }
674     } // end for vector size
675 
676     // Figure out how many elements are in a work block
677     size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
678     size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
679     uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats.
680     size_t stride = blockCount;
681 
682     if (gWimpyMode)
683         stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
684 
685     // we handle 64-bit types a bit differently.
686     if( lastCase == 0 )
687         lastCase = 0x100000000ULL;
688 
689     uint64_t i, j;
690     error = 0;
691     uint64_t printMask = (lastCase >> 4) - 1;
692     cl_uint count = 0;
693     int addressSpace;
694     size_t loopCount;
695     cl_uint threadCount = GetThreadCount();
696 
697     ComputeReferenceInfoF fref;
698     fref.x = (float *)gIn_single;
699     fref.r = (cl_half *)gOut_half_reference;
700     fref.f = referenceFunc;
701     fref.lim = blockCount;
702     fref.count = (blockCount + threadCount - 1) / threadCount;
703 
704     CheckResultInfoF fchk;
705     fchk.x = (const float *)gIn_single;
706     fchk.r = (const cl_half *)gOut_half_reference;
707     fchk.s = (const cl_half *)gOut_half;
708     fchk.f = referenceFunc;
709     fchk.lim = blockCount;
710     fchk.count = (blockCount + threadCount - 1) / threadCount;
711 
712     ComputeReferenceInfoD dref;
713     dref.x = (double *)gIn_double;
714     dref.r = (cl_half *)gOut_half_reference_double;
715     dref.f = doubleReferenceFunc;
716     dref.lim = blockCount;
717     dref.count = (blockCount + threadCount - 1) / threadCount;
718 
719     CheckResultInfoD dchk;
720     dchk.x = (const double *)gIn_double;
721     dchk.r = (const cl_half *)gOut_half_reference_double;
722     dchk.s = (const cl_half *)gOut_half;
723     dchk.f = doubleReferenceFunc;
724     dchk.lim = blockCount;
725     dchk.count = (blockCount + threadCount - 1) / threadCount;
726 
727     for( i = 0; i < lastCase; i += stride )
728     {
729         count = (cl_uint) MIN( blockCount, lastCase - i );
730         fref.i = i;
731         dref.i = i;
732 
733         // Compute the input and reference
734         ThreadPool_Do(ReferenceF, threadCount, &fref);
735 
736         error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
737         if (error) {
738             vlog_error( "Failure in clWriteBuffer\n" );
739             gFailCount++;
740             goto exit;
741         }
742 
743         if (gTestDouble) {
744             ThreadPool_Do(ReferenceD, threadCount, &dref);
745 
746             error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
747             if (error) {
748                 vlog_error( "Failure in clWriteBuffer\n" );
749                 gFailCount++;
750                 goto exit;
751             }
752         }
753 
754         for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
755             // Loop through vector sizes
756             fchk.vsz = g_arrVecSizes[vectorSize];
757             dchk.vsz = g_arrVecSizes[vectorSize];
758 
759             for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
760                 // Loop over address spaces
761                 fchk.aspace = addressSpaceNames[addressSpace];
762                 dchk.aspace = addressSpaceNames[addressSpace];
763 
764                 cl_uint pattern = 0xdeaddead;
765                 memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
766 
767                 error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
768                                              0, count * sizeof(cl_half),
769                                              gOut_half, 0, NULL, NULL);
770                 if (error) {
771                     vlog_error( "Failure in clWriteArray\n" );
772                     gFailCount++;
773                     goto exit;
774                 }
775 
776                 error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
777                                        numVecs(count, vectorSize, aligned) ,
778                                   runsOverBy(count, vectorSize, aligned));
779                 if (error) {
780                     gFailCount++;
781                     goto exit;
782                 }
783 
784                 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
785                                             count * sizeof(cl_half), gOut_half,
786                                             0, NULL, NULL);
787                 if (error) {
788                     vlog_error( "Failure in clReadArray\n" );
789                     gFailCount++;
790                     goto exit;
791                 }
792 
793                 error = ThreadPool_Do(CheckF, threadCount, &fchk);
794                 if (error) {
795                             gFailCount++;
796                             goto exit;
797                         }
798 
799                 if (gTestDouble) {
800                     memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
801 
802                     error = clEnqueueWriteBuffer(
803                         gQueue, gOutBuffer_half, CL_FALSE, 0,
804                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
805                     if (error) {
806                         vlog_error( "Failure in clWriteArray\n" );
807                         gFailCount++;
808                         goto exit;
809                     }
810 
811                     error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
812                                       numVecs(count, vectorSize, aligned),
813                                       runsOverBy(count, vectorSize, aligned));
814                     if (error) {
815                         gFailCount++;
816                         goto exit;
817                     }
818 
819                     error = clEnqueueReadBuffer(
820                         gQueue, gOutBuffer_half, CL_TRUE, 0,
821                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
822                     if (error) {
823                         vlog_error( "Failure in clReadArray\n" );
824                         gFailCount++;
825                         goto exit;
826                     }
827 
828                     error = ThreadPool_Do(CheckD, threadCount, &dchk);
829                     if (error) {
830                                 gFailCount++;
831                                 goto exit;
832                             }
833                         }
834                     }
835                 }
836 
837         if( ((i+blockCount) & ~printMask) == (i+blockCount) )
838         {
839             vlog( "." );
840             fflush( stdout );
841         }
842     }  // end last case
843 
844     loopCount = count == blockCount ? 1 : 100;
845     if( gReportTimes )
846     {
847         //Init the input stream
848         cl_float *p = (cl_float *)gIn_single;
849         for( j = 0; j < count; j++ )
850             p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
851 
852         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
853         {
854             vlog_error( "Failure in clWriteArray\n" );
855             gFailCount++;
856             goto exit;
857         }
858 
859         if( gTestDouble )
860         {
861             //Init the input stream
862             cl_double *q = (cl_double *)gIn_double;
863             for( j = 0; j < count; j++ )
864                 q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
865 
866             if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
867             {
868                 vlog_error( "Failure in clWriteArray\n" );
869                 gFailCount++;
870                 goto exit;
871             }
872         }
873 
874         //Run again for timing
875         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
876         {
877             uint64_t bestTime = -1ULL;
878             for( j = 0; j < loopCount; j++ )
879             {
880                 uint64_t startTime = ReadTime();
881 
882 
883                 if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
884                                        runsOverBy(count, vectorSize, aligned)) ) )
885                 {
886                     gFailCount++;
887                     goto exit;
888                 }
889 
890                 if( (error = clFinish(gQueue)) )
891                 {
892                     vlog_error( "Failure in clFinish\n" );
893                     gFailCount++;
894                     goto exit;
895                 }
896                 uint64_t currentTime = ReadTime() - startTime;
897                 if( currentTime < bestTime )
898                     bestTime = currentTime;
899                 time[ vectorSize ] += currentTime;
900             }
901             if( bestTime < min_time[ vectorSize ] )
902                 min_time[ vectorSize ] = bestTime ;
903 
904             if( gTestDouble )
905             {
906                 bestTime = -1ULL;
907                 for( j = 0; j < loopCount; j++ )
908                 {
909                     uint64_t startTime = ReadTime();
910                     if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
911                                            runsOverBy(count, vectorSize, aligned)) ) )
912                     {
913                         gFailCount++;
914                         goto exit;
915                     }
916 
917                     if( (error = clFinish(gQueue)) )
918                     {
919                         vlog_error( "Failure in clFinish\n" );
920                         gFailCount++;
921                         goto exit;
922                     }
923                     uint64_t currentTime = ReadTime() - startTime;
924                     if( currentTime < bestTime )
925                         bestTime = currentTime;
926                     doubleTime[ vectorSize ] += currentTime;
927                 }
928                 if( bestTime < min_double_time[ vectorSize ] )
929                     min_double_time[ vectorSize ] = bestTime;
930             }
931         }
932     }
933 
934     if( gReportTimes )
935     {
936         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
937             vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
938                       "average us/elem", "vStoreHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
939         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
940             vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
941                       "best us/elem", "vStoreHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
942         if( gTestDouble )
943         {
944             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
945                 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
946                           "average us/elem (double)", "vStoreHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0],  (g_arrVecSizes[vectorSize])  );
947             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
948                 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
949                           "best us/elem (double)", "vStoreHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
950         }
951     }
952 
953 exit:
954     //clean up
955     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
956     {
957         for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
958             clReleaseKernel( kernels[ vectorSize ][ addressSpace ] );
959             clReleaseProgram( programs[ vectorSize ][ addressSpace ] );
960             if( gTestDouble )
961             {
962                 clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
963                 clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
964             }
965         }
966     }
967 
968     return error;
969 }
970 
Test_vStoreaHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)971 int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
972 {
973     int vectorSize, error;
974     cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
975     cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
976 
977     uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
978     uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
979     memset( min_time, -1, sizeof( min_time ) );
980     cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
981     cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
982     uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
983     uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
984     memset( min_double_time, -1, sizeof( min_double_time ) );
985 
986     bool aligned = true;
987 
988     int minVectorSize = kMinVectorSize;
989     // There is no aligned scalar vstorea_half
990     if( 0 == minVectorSize )
991         minVectorSize = 1;
992 
993     //Loop over vector sizes
994     for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
995     {
996         const char *source[] = {
997             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
998             "{\n"
999             "   size_t i = get_global_id(0);\n"
1000             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1001             "}\n"
1002         };
1003 
1004         const char *source_v3[] = {
1005             "__kernel void test( __global float3 *p, __global half *f )\n"
1006             "{\n"
1007             "   size_t i = get_global_id(0);\n"
1008             "   vstorea_half3",roundName,"( p[i], i, f );\n"
1009             "   vstore_half",roundName,"( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
1010             "}\n"
1011         };
1012 
1013         const char *source_private[] = {
1014             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1015             "{\n"
1016             "   __private float", vector_size_name_extensions[vectorSize], " data;\n"
1017             "   size_t i = get_global_id(0);\n"
1018             "   data = p[i];\n"
1019             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1020             "}\n"
1021         };
1022 
1023         const char *source_private_v3[] = {
1024             "__kernel void test( __global float3 *p, __global half *f )\n"
1025             "{\n"
1026             "   __private float", vector_size_name_extensions[vectorSize], " data;\n"
1027             "   size_t i = get_global_id(0);\n"
1028             "   data = p[i];\n"
1029             "   vstorea_half3",roundName,"( data, i, f );\n"
1030             "   vstore_half",roundName,"( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
1031             "}\n"
1032         };
1033 
1034         char local_buf_size[10];
1035         sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
1036         const char *source_local[] = {
1037             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1038             "{\n"
1039             "   __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1040             "   size_t i = get_global_id(0);\n"
1041             "   size_t lid = get_local_id(0);\n"
1042             "   data[lid] = p[i];\n"
1043             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1044             "}\n"
1045         };
1046 
1047         const char *source_local_v3[] = {
1048             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1049             "{\n"
1050             "   __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1051             "   size_t i = get_global_id(0);\n"
1052             "   size_t lid = get_local_id(0);\n"
1053             "   data[lid] = p[i];\n"
1054             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1055             "   vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1056             "}\n"
1057         };
1058 
1059         const char *double_source[] = {
1060             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1061             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1062             "{\n"
1063             "   size_t i = get_global_id(0);\n"
1064             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1065             "}\n"
1066         };
1067 
1068         const char *double_source_v3[] = {
1069             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1070             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1071             "{\n"
1072             "   size_t i = get_global_id(0);\n"
1073             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1074             "   vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1075             "}\n"
1076         };
1077 
1078         const char *double_source_private[] = {
1079             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1080             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1081             "{\n"
1082             "   __private double", vector_size_name_extensions[vectorSize], " data;\n"
1083             "   size_t i = get_global_id(0);\n"
1084             "   data = p[i];\n"
1085             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1086             "}\n"
1087         };
1088 
1089         const char *double_source_private_v3[] = {
1090             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1091             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1092             "{\n"
1093             "   __private double", vector_size_name_extensions[vectorSize], " data;\n"
1094             "   size_t i = get_global_id(0);\n"
1095             "   data = p[i];\n"
1096             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1097             "   vstore_half",roundName,"( ((__global  double *)p)[4*i+3], 4*i+3, f);\n"
1098             "}\n"
1099         };
1100 
1101         const char *double_source_local[] = {
1102             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1103             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1104             "{\n"
1105             "   __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1106             "   size_t i = get_global_id(0);\n"
1107             "   size_t lid = get_local_id(0);\n"
1108             "   data[lid] = p[i];\n"
1109             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1110             "}\n"
1111         };
1112 
1113         const char *double_source_local_v3[] = {
1114             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1115             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1116             "{\n"
1117             "   __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1118             "   size_t i = get_global_id(0);\n"
1119             "   size_t lid = get_local_id(0);\n"
1120             "   data[lid] = p[i];\n"
1121             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1122             "   vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1123             "}\n"
1124         };
1125 
1126         if(g_arrVecSizes[vectorSize] == 3) {
1127             programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
1128             if( NULL == programs[ vectorSize ][0] )
1129             {
1130                 gFailCount++;
1131                 return -1;
1132             }
1133         } else {
1134             programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
1135             if( NULL == programs[ vectorSize ][0] )
1136             {
1137                 gFailCount++;
1138                 return -1;
1139             }
1140         }
1141 
1142         kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
1143         if( NULL == kernels[vectorSize][0] )
1144         {
1145             gFailCount++;
1146             vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
1147             return error;
1148         }
1149 
1150         if(g_arrVecSizes[vectorSize] == 3) {
1151             programs[vectorSize][1] = MakeProgram( device, source_private_v3, sizeof(source_private_v3) / sizeof( source_private_v3[0]) );
1152             if( NULL == programs[ vectorSize ][1] )
1153             {
1154                 gFailCount++;
1155                 return -1;
1156             }
1157         } else {
1158             programs[vectorSize][1] = MakeProgram( device, source_private, sizeof(source_private) / sizeof( source_private[0]) );
1159             if( NULL == programs[ vectorSize ][1] )
1160             {
1161                 gFailCount++;
1162                 return -1;
1163             }
1164         }
1165 
1166         kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
1167         if( NULL == kernels[vectorSize][1] )
1168         {
1169             gFailCount++;
1170             vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
1171             return error;
1172         }
1173 
1174         if(g_arrVecSizes[vectorSize] == 3) {
1175             programs[vectorSize][2] = MakeProgram( device, source_local_v3, sizeof(source_local_v3) / sizeof( source_local_v3[0]) );
1176             if( NULL == programs[ vectorSize ][2] )
1177             {
1178                 gFailCount++;
1179                 return -1;
1180             }
1181         } else {
1182             programs[vectorSize][2] = MakeProgram( device, source_local, sizeof(source_local) / sizeof( source_local[0]) );
1183             if( NULL == programs[ vectorSize ][2] )
1184             {
1185                 gFailCount++;
1186                 return -1;
1187             }
1188         }
1189 
1190         kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
1191         if( NULL == kernels[vectorSize][2] )
1192         {
1193             gFailCount++;
1194             vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
1195             return error;
1196         }
1197 
1198         if( gTestDouble )
1199         {
1200             if(g_arrVecSizes[vectorSize] == 3) {
1201                 doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
1202                 if( NULL == doublePrograms[ vectorSize ][0] )
1203                 {
1204                     gFailCount++;
1205                     return -1;
1206                 }
1207             } else {
1208                 doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
1209                 if( NULL == doublePrograms[ vectorSize ][0] )
1210                 {
1211                     gFailCount++;
1212                     return -1;
1213                 }
1214             }
1215 
1216             doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
1217             if( NULL == kernels[vectorSize][0] )
1218             {
1219                 gFailCount++;
1220                 vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
1221                 return error;
1222             }
1223 
1224             if(g_arrVecSizes[vectorSize] == 3) {
1225                 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_v3, sizeof(double_source_private_v3) / sizeof( double_source_private_v3[0]) );
1226                 if( NULL == doublePrograms[ vectorSize ][1] )
1227                 {
1228                     gFailCount++;
1229                     return -1;
1230                 }
1231             } else {
1232                 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private, sizeof(double_source_private) / sizeof( double_source_private[0]) );
1233                 if( NULL == doublePrograms[ vectorSize ][1] )
1234                 {
1235                     gFailCount++;
1236                     return -1;
1237                 }
1238             }
1239 
1240             doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
1241             if( NULL == kernels[vectorSize][1] )
1242             {
1243                 gFailCount++;
1244                 vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
1245                 return error;
1246             }
1247 
1248             if(g_arrVecSizes[vectorSize] == 3) {
1249                 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_v3, sizeof(double_source_local_v3) / sizeof( double_source_local_v3[0]) );
1250                 if( NULL == doublePrograms[ vectorSize ][2] )
1251                 {
1252                     gFailCount++;
1253                     return -1;
1254                 }
1255             } else {
1256                 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local, sizeof(double_source_local) / sizeof( double_source_local[0]) );
1257                 if( NULL == doublePrograms[ vectorSize ][2] )
1258                 {
1259                     gFailCount++;
1260                     return -1;
1261                 }
1262             }
1263 
1264             doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
1265             if( NULL == kernels[vectorSize][2] )
1266             {
1267                 gFailCount++;
1268                 vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
1269                 return error;
1270             }
1271         }
1272     }
1273 
1274     // Figure out how many elements are in a work block
1275     size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
1276     size_t blockCount = BUFFER_SIZE / elementSize;
1277     uint64_t lastCase = 1ULL << (8*sizeof(float));
1278     size_t stride = blockCount;
1279 
1280     if (gWimpyMode)
1281         stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
1282 
1283     // we handle 64-bit types a bit differently.
1284     if( lastCase == 0 )
1285         lastCase = 0x100000000ULL;
1286     uint64_t i, j;
1287     error = 0;
1288     uint64_t printMask = (lastCase >> 4) - 1;
1289     cl_uint count = 0;
1290     int addressSpace;
1291     size_t loopCount;
1292     cl_uint threadCount = GetThreadCount();
1293 
1294     ComputeReferenceInfoF fref;
1295     fref.x = (float *)gIn_single;
1296     fref.r = (cl_half *)gOut_half_reference;
1297     fref.f = referenceFunc;
1298     fref.lim = blockCount;
1299     fref.count = (blockCount + threadCount - 1) / threadCount;
1300 
1301     CheckResultInfoF fchk;
1302     fchk.x = (const float *)gIn_single;
1303     fchk.r = (const cl_half *)gOut_half_reference;
1304     fchk.s = (const cl_half *)gOut_half;
1305     fchk.f = referenceFunc;
1306     fchk.lim = blockCount;
1307     fchk.count = (blockCount + threadCount - 1) / threadCount;
1308 
1309     ComputeReferenceInfoD dref;
1310     dref.x = (double *)gIn_double;
1311     dref.r = (cl_half *)gOut_half_reference_double;
1312     dref.f = doubleReferenceFunc;
1313     dref.lim = blockCount;
1314     dref.count = (blockCount + threadCount - 1) / threadCount;
1315 
1316     CheckResultInfoD dchk;
1317     dchk.x = (const double *)gIn_double;
1318     dchk.r = (const cl_half *)gOut_half_reference_double;
1319     dchk.s = (const cl_half *)gOut_half;
1320     dchk.f = doubleReferenceFunc;
1321     dchk.lim = blockCount;
1322     dchk.count = (blockCount + threadCount - 1) / threadCount;
1323 
1324     for( i = 0; i < (uint64_t)lastCase; i += stride )
1325     {
1326         count = (cl_uint) MIN( blockCount, lastCase - i );
1327         fref.i = i;
1328         dref.i = i;
1329 
1330         // Create the input and reference
1331         ThreadPool_Do(ReferenceF, threadCount, &fref);
1332 
1333         error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
1334         if (error) {
1335             vlog_error( "Failure in clWriteArray\n" );
1336             gFailCount++;
1337             goto exit;
1338         }
1339 
1340         if (gTestDouble) {
1341             ThreadPool_Do(ReferenceD, threadCount, &dref);
1342 
1343             error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
1344             if (error) {
1345                 vlog_error( "Failure in clWriteArray\n" );
1346                 gFailCount++;
1347                 goto exit;
1348             }
1349         }
1350 
1351         for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
1352             // Loop over vector legths
1353             fchk.vsz = g_arrVecSizes[vectorSize];
1354             dchk.vsz = g_arrVecSizes[vectorSize];
1355 
1356             for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
1357                 // Loop over address spaces
1358                 fchk.aspace = addressSpaceNames[addressSpace];
1359                 dchk.aspace = addressSpaceNames[addressSpace];
1360 
1361                 cl_uint pattern = 0xdeaddead;
1362                 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
1363 
1364                 error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
1365                                              0, count * sizeof(cl_half),
1366                                              gOut_half, 0, NULL, NULL);
1367                 if (error) {
1368                     vlog_error( "Failure in clWriteArray\n" );
1369                     gFailCount++;
1370                     goto exit;
1371                 }
1372 
1373                 error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
1374                                   numVecs(count, vectorSize, aligned),
1375                                   runsOverBy(count, vectorSize, aligned));
1376                 if (error) {
1377                     gFailCount++;
1378                     goto exit;
1379                 }
1380 
1381                 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
1382                                             count * sizeof(cl_half), gOut_half,
1383                                             0, NULL, NULL);
1384                 if (error) {
1385                     vlog_error( "Failure in clReadArray\n" );
1386                     gFailCount++;
1387                     goto exit;
1388                 }
1389 
1390                 error = ThreadPool_Do(CheckF, threadCount, &fchk);
1391                 if (error) {
1392                             gFailCount++;
1393                             goto exit;
1394                         }
1395 
1396                 if (gTestDouble) {
1397                     memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
1398 
1399                     error = clEnqueueWriteBuffer(
1400                         gQueue, gOutBuffer_half, CL_FALSE, 0,
1401                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1402                     if (error) {
1403                         vlog_error( "Failure in clWriteArray\n" );
1404                         gFailCount++;
1405                         goto exit;
1406                     }
1407 
1408                     error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
1409                                       numVecs(count, vectorSize, aligned),
1410                                       runsOverBy(count, vectorSize, aligned));
1411                     if (error) {
1412                         gFailCount++;
1413                         goto exit;
1414                     }
1415 
1416                     error = clEnqueueReadBuffer(
1417                         gQueue, gOutBuffer_half, CL_TRUE, 0,
1418                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1419                     if (error) {
1420                         vlog_error( "Failure in clReadArray\n" );
1421                         gFailCount++;
1422                         goto exit;
1423                     }
1424 
1425                     error = ThreadPool_Do(CheckD, threadCount, &dchk);
1426                     if (error) {
1427                                 gFailCount++;
1428                                 goto exit;
1429                             }
1430                         }
1431                     }
1432         }  // end for vector size
1433 
1434         if( ((i+blockCount) & ~printMask) == (i+blockCount) ) {
1435             vlog( "." );
1436             fflush( stdout );
1437         }
1438     }  // for end lastcase
1439 
1440     loopCount = count == blockCount ? 1 : 100;
1441     if( gReportTimes )
1442     {
1443         //Init the input stream
1444         cl_float *p = (cl_float *)gIn_single;
1445         for( j = 0; j < count; j++ )
1446             p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
1447 
1448         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
1449         {
1450             vlog_error( "Failure in clWriteArray\n" );
1451             gFailCount++;
1452             goto exit;
1453         }
1454 
1455         if( gTestDouble )
1456         {
1457             //Init the input stream
1458             cl_double *q = (cl_double *)gIn_double;
1459             for( j = 0; j < count; j++ )
1460                 q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
1461 
1462             if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
1463             {
1464                 vlog_error( "Failure in clWriteArray\n" );
1465                 gFailCount++;
1466                 goto exit;
1467             }
1468         }
1469 
1470         //Run again for timing
1471         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1472         {
1473             uint64_t bestTime = -1ULL;
1474             for( j = 0; j < loopCount; j++ )
1475             {
1476                 uint64_t startTime = ReadTime();
1477                 if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
1478                                        runsOverBy(count, vectorSize, aligned)) ) )
1479                 {
1480                     gFailCount++;
1481                     goto exit;
1482                 }
1483 
1484                 if( (error = clFinish(gQueue)) )
1485                 {
1486                     vlog_error( "Failure in clFinish\n" );
1487                     gFailCount++;
1488                     goto exit;
1489                 }
1490                 uint64_t currentTime = ReadTime() - startTime;
1491                 if( currentTime < bestTime )
1492                     bestTime = currentTime;
1493                 time[ vectorSize ] += currentTime;
1494             }
1495             if( bestTime < min_time[ vectorSize ] )
1496                 min_time[ vectorSize ] = bestTime ;
1497 
1498             if( gTestDouble )
1499             {
1500                 bestTime = -1ULL;
1501                 for( j = 0; j < loopCount; j++ )
1502                 {
1503                     uint64_t startTime = ReadTime();
1504                     if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
1505                                            runsOverBy(count, vectorSize, aligned)) ) )
1506                     {
1507                         gFailCount++;
1508                         goto exit;
1509                     }
1510 
1511                     if( (error = clFinish(gQueue)) )
1512                     {
1513                         vlog_error( "Failure in clFinish\n" );
1514                         gFailCount++;
1515                         goto exit;
1516                     }
1517                     uint64_t currentTime = ReadTime() - startTime;
1518                     if( currentTime < bestTime )
1519                         bestTime = currentTime;
1520                     doubleTime[ vectorSize ] += currentTime;
1521                 }
1522                 if( bestTime < min_double_time[ vectorSize ] )
1523                     min_double_time[ vectorSize ] = bestTime;
1524             }
1525         }
1526     }
1527 
1528     if( gReportTimes )
1529     {
1530         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1531             vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
1532                       "average us/elem", "vStoreaHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1533         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1534             vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
1535                       "best us/elem", "vStoreaHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
1536         if( gTestDouble )
1537         {
1538             for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1539                 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
1540                           "average us/elem (double)", "vStoreaHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
1541             for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1542                 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
1543                           "best us/elem (double)", "vStoreaHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1544         }
1545     }
1546 
1547 exit:
1548     //clean up
1549     for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1550     {
1551         for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
1552             clReleaseKernel( kernels[ vectorSize ][addressSpace] );
1553             clReleaseProgram( programs[ vectorSize ][addressSpace] );
1554             if( gTestDouble )
1555             {
1556                 clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
1557                 clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
1558             }
1559         }
1560     }
1561 
1562     return error;
1563 }
1564 
1565