1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <string.h>
17 #include "cl_utils.h"
18 #include "tests.h"
19 #include "harness/testHarness.h"
20 
test_roundTrip(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)21 int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
22 {
23     int vectorSize, error;
24     uint64_t i, j;
25     cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
26     cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
27     cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
28     cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
29     uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
30     uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
31     uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
32     uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
33     memset( min_time, -1, sizeof( min_time ) );
34     memset( min_double_time, -1, sizeof( min_double_time ) );
35 
36     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
37     {
38         const char *source[] = {
39             "__kernel void test( const __global half *in, __global half *out )\n"
40             "{\n"
41             "   size_t i = get_global_id(0);\n"
42             "   vstore_half",vector_size_name_extensions[vectorSize],"( vload_half",vector_size_name_extensions[vectorSize],"(i, in),  i, out);\n"
43             "}\n"
44         };
45 
46         const char *doubleSource[] = {
47             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
48             "__kernel void test( const __global half *in, __global half *out )\n"
49             "{\n"
50             "   size_t i = get_global_id(0);\n"
51             "   vstore_half",vector_size_name_extensions[vectorSize],"( convert_double", vector_size_name_extensions[vectorSize], "( vload_half",vector_size_name_extensions[vectorSize],"(i, in)),  i, out);\n"
52             "}\n"
53         };
54 
55         const char *sourceV3[] = {
56             "__kernel void test( const __global half *in, __global half *out,"
57             "                    uint extra_last_thread  )\n"
58             "{\n"
59             "   size_t i = get_global_id(0);\n"
60             "   size_t last_i = get_global_size(0)-1;\n"
61             "   size_t adjust = 0;\n"
62             "   if(i == last_i && extra_last_thread != 0) { \n"
63             "     adjust = 3-extra_last_thread;\n"
64             "   }\n"
65             "   vstore_half3( vload_half3(i, in-adjust),  i, out-adjust);\n"
66             "}\n"
67         };
68 
69         const char *doubleSourceV3[] = {
70             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
71             "__kernel void test( const __global half *in, __global half *out,"
72             "                    uint extra_last_thread  )\n"
73             "{\n"
74             "   size_t i = get_global_id(0);\n"
75             "   size_t last_i = get_global_size(0)-1;\n"
76             "   size_t adjust = 0;\n"
77             "   if(i == last_i && extra_last_thread != 0) { \n"
78             "     adjust = 3-extra_last_thread;\n"
79             "   }\n"
80             "   vstore_half3( vload_half3(i, in-adjust),  i, out-adjust);\n"
81             "}\n"
82         };
83 
84 /*
85         const char *sourceV3aligned[] = {
86             "__kernel void test( const __global half *in, __global half *out )\n"
87             "{\n"
88             "   size_t i = get_global_id(0);\n"
89             "   vstorea_half3( vloada_half3(i, in),  i, out);\n"
90             "   vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
91             "}\n"
92         };
93 
94         const char *doubleSourceV3aligned[] = {
95             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
96             "__kernel void test( const __global half *in, __global half *out )\n"
97             "{\n"
98             "   size_t i = get_global_id(0);\n"
99             "   vstorea_half3( vloada_half3(i, in),  i, out);\n"
100             "   vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
101             "}\n"
102         };
103 */
104 
105         if(g_arrVecSizes[vectorSize] == 3) {
106             programs[vectorSize] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0])  );
107             if( NULL == programs[ vectorSize ] )
108             {
109                 gFailCount++;
110 
111                 return -1;
112             }
113         } else {
114             programs[vectorSize] = MakeProgram( device, source, sizeof( source) / sizeof( source[0])  );
115             if( NULL == programs[ vectorSize ] )
116             {
117                 gFailCount++;
118                 return -1;
119             }
120         }
121 
122         kernels[ vectorSize ] = clCreateKernel( programs[ vectorSize ], "test", &error );
123         if( NULL == kernels[vectorSize] )
124         {
125             gFailCount++;
126             vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
127             return error;
128         }
129 
130         if( gTestDouble )
131         {
132             if(g_arrVecSizes[vectorSize] == 3) {
133                 doublePrograms[vectorSize] = MakeProgram( device, doubleSourceV3, sizeof( doubleSourceV3) / sizeof( doubleSourceV3[0])  );
134                 if( NULL == doublePrograms[ vectorSize ] )
135                 {
136                     gFailCount++;
137                     return -1;
138                 }
139             } else {
140                 doublePrograms[vectorSize] = MakeProgram( device, doubleSource, sizeof( doubleSource) / sizeof( doubleSource[0])  );
141                 if( NULL == doublePrograms[ vectorSize ] )
142                 {
143                     gFailCount++;
144                     return -1;
145                 }
146             }
147 
148             doubleKernels[ vectorSize ] = clCreateKernel( doublePrograms[ vectorSize ], "test", &error );
149             if( NULL == doubleKernels[vectorSize] )
150             {
151                 gFailCount++;
152                 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
153                 return error;
154             }
155         }
156     }
157 
158     // Figure out how many elements are in a work block
159     size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
160     size_t blockCount = (size_t)getBufferSize(device) / elementSize; //elementSize is a power of two
161     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half
162     size_t stride = blockCount;
163 
164     error = 0;
165     uint64_t printMask = (lastCase >> 4) - 1;
166     uint32_t count;
167     size_t loopCount;
168 
169     for( i = 0; i < (uint64_t)lastCase; i += stride )
170     {
171         count = (uint32_t) MIN( blockCount, lastCase - i );
172 
173         //Init the input stream
174         uint16_t *p = (uint16_t *)gIn_half;
175         for( j = 0; j < count; j++ )
176             p[j] = j + i;
177 
178         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)) )
179         {
180             vlog_error( "Failure in clWriteArray\n" );
181             gFailCount++;
182             goto exit;
183         }
184 
185         //Check the vector lengths
186         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
187         { // here we loop through vector sizes -- 3 is last.
188             uint32_t pattern = 0xdeaddead;
189             memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
190 
191             if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
192             {
193                 vlog_error( "Failure in clWriteArray\n" );
194                 gFailCount++;
195                 goto exit;
196             }
197 
198 
199             // here is where "3" starts to cause problems.
200             error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
201                               runsOverBy(count, vectorSize, false) );
202             if(error)
203             {
204                 gFailCount++;
205                 goto exit;
206             }
207 
208             if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
209             {
210                 vlog_error( "Failure in clReadArray\n" );
211                 gFailCount++;
212                 goto exit;
213             }
214 
215             if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
216             {
217                 uint16_t *u1 = (uint16_t *)gOut_half;
218                 uint16_t *u2 = (uint16_t *)gIn_half;
219                 for( j = 0; j < count; j++ )
220                 {
221                     if( u1[j] != u2[j] )
222                     {
223                         uint16_t abs1 = u1[j] & 0x7fff;
224                         uint16_t abs2 = u2[j] & 0x7fff;
225                         if( abs1 > 0x7c00 && abs2 > 0x7c00 )
226                             continue; //any NaN is okay if NaN is input
227 
228                         // if reference result is sub normal, test if the output is flushed to zero
229                         if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
230                             continue;
231 
232                         vlog_error( "%lld) (of %lld)  Failure at 0x%4.4x:  0x%4.4x   vector_size = %d \n", j, (uint64_t)count, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
233                         gFailCount++;
234                         error = -1;
235                         goto exit;
236                     }
237                 }
238             }
239 
240             if( gTestDouble )
241             {
242                 memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
243                 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
244                 {
245                     vlog_error( "Failure in clWriteArray\n" );
246                     gFailCount++;
247                     goto exit;
248                 }
249 
250 
251                 if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
252                                        runsOverBy(count, vectorSize, false) ) ) )
253                 {
254                     gFailCount++;
255                     goto exit;
256                 }
257 
258                 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
259                 {
260                     vlog_error( "Failure in clReadArray\n" );
261                     gFailCount++;
262                     goto exit;
263                 }
264 
265                 if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
266                 {
267                     uint16_t *u1 = (uint16_t *)gOut_half;
268                     uint16_t *u2 = (uint16_t *)gIn_half;
269                     for( j = 0; j < count; j++ )
270                     {
271                         if( u1[j] != u2[j] )
272                         {
273                             uint16_t abs1 = u1[j] & 0x7fff;
274                             uint16_t abs2 = u2[j] & 0x7fff;
275                             if( abs1 > 0x7c00 && abs2 > 0x7c00 )
276                                 continue; //any NaN is okay if NaN is input
277 
278                             // if reference result is sub normal, test if the output is flushed to zero
279                             if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
280                                 continue;
281 
282                             vlog_error( "%lld) Failure at 0x%4.4x:  0x%4.4x   vector_size = %d (double precsion)\n", j, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
283                             gFailCount++;
284                             error = -1;
285                             goto exit;
286                         }
287                     }
288                 }
289             }
290         }
291 
292         if( ((i+blockCount) & ~printMask) == (i+blockCount) )
293         {
294             vlog( "." );
295             fflush( stdout );
296         }
297     }
298 
299     vlog( "\n" );
300 
301     loopCount = 100;
302     if( gReportTimes )
303     {
304         //Run again for timing
305         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
306         {
307             uint64_t bestTime = -1ULL;
308 
309             for( j = 0; j < loopCount; j++ )
310             {
311                 uint64_t startTime = ReadTime();
312                 if( (error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half,numVecs(count, vectorSize, false) ,
313                                        runsOverBy(count, vectorSize, false)) ) )
314                 {
315                     gFailCount++;
316                     goto exit;
317                 }
318 
319                 if( (error = clFinish(gQueue)) )
320                 {
321                     vlog_error( "Failure in clFinish\n" );
322                     gFailCount++;
323                     goto exit;
324                 }
325                 uint64_t currentTime = ReadTime() - startTime;
326                 if( currentTime < bestTime )
327                     bestTime = currentTime;
328                 time[ vectorSize ] += currentTime;
329             }
330             if( bestTime < min_time[ vectorSize ] )
331                 min_time[ vectorSize ] = bestTime;
332 
333             if( gTestDouble )
334             {
335                 bestTime = -1ULL;
336                 for( j = 0; j < loopCount; j++ )
337                 {
338                     uint64_t startTime = ReadTime();
339                     if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
340                                            runsOverBy(count, vectorSize, false)) ) )
341                     {
342                         gFailCount++;
343                         goto exit;
344                     }
345 
346                     if( (error = clFinish(gQueue)) )
347                     {
348                         vlog_error( "Failure in clFinish\n" );
349                         gFailCount++;
350                         goto exit;
351                     }
352                     uint64_t currentTime = ReadTime() - startTime;
353                     if( currentTime < bestTime )
354                         bestTime = currentTime;
355                     doubleTime[ vectorSize ] += currentTime;
356                 }
357                 if( bestTime < min_double_time[ vectorSize ] )
358                     min_double_time[ vectorSize ] = bestTime;
359             }
360         }
361     }
362 
363     if( gReportTimes )
364     {
365         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
366             vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem", "roundTrip avg. (vector size: %d)", (g_arrVecSizes[vectorSize]) );
367         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
368             vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem", "roundTrip best (vector size: %d)", (g_arrVecSizes[vectorSize])  );
369         if( gTestDouble )
370         {
371             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
372                 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem (double)", "roundTrip avg. d (vector size: %d)", (g_arrVecSizes[vectorSize])  );
373             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
374                 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem (double)", "roundTrip best d (vector size: %d)", (g_arrVecSizes[vectorSize]) );
375         }
376     }
377 
378 exit:
379     //clean up
380     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
381     {
382         clReleaseKernel( kernels[ vectorSize ] );
383         clReleaseProgram( programs[ vectorSize ] );
384         if( gTestDouble )
385         {
386             clReleaseKernel( doubleKernels[ vectorSize ] );
387             clReleaseProgram( doublePrograms[ vectorSize ] );
388         }
389     }
390 
391     return error;
392 }
393 
394 
395