1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <string.h>
17 #include "cl_utils.h"
18 #include "tests.h"
19 #include "harness/testHarness.h"
20
test_roundTrip(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)21 int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
22 {
23 int vectorSize, error;
24 uint64_t i, j;
25 cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
26 cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
27 cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
28 cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
29 uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
30 uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
31 uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
32 uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
33 memset( min_time, -1, sizeof( min_time ) );
34 memset( min_double_time, -1, sizeof( min_double_time ) );
35
36 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
37 {
38 const char *source[] = {
39 "__kernel void test( const __global half *in, __global half *out )\n"
40 "{\n"
41 " size_t i = get_global_id(0);\n"
42 " vstore_half",vector_size_name_extensions[vectorSize],"( vload_half",vector_size_name_extensions[vectorSize],"(i, in), i, out);\n"
43 "}\n"
44 };
45
46 const char *doubleSource[] = {
47 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
48 "__kernel void test( const __global half *in, __global half *out )\n"
49 "{\n"
50 " size_t i = get_global_id(0);\n"
51 " vstore_half",vector_size_name_extensions[vectorSize],"( convert_double", vector_size_name_extensions[vectorSize], "( vload_half",vector_size_name_extensions[vectorSize],"(i, in)), i, out);\n"
52 "}\n"
53 };
54
55 const char *sourceV3[] = {
56 "__kernel void test( const __global half *in, __global half *out,"
57 " uint extra_last_thread )\n"
58 "{\n"
59 " size_t i = get_global_id(0);\n"
60 " size_t last_i = get_global_size(0)-1;\n"
61 " size_t adjust = 0;\n"
62 " if(i == last_i && extra_last_thread != 0) { \n"
63 " adjust = 3-extra_last_thread;\n"
64 " }\n"
65 " vstore_half3( vload_half3(i, in-adjust), i, out-adjust);\n"
66 "}\n"
67 };
68
69 const char *doubleSourceV3[] = {
70 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
71 "__kernel void test( const __global half *in, __global half *out,"
72 " uint extra_last_thread )\n"
73 "{\n"
74 " size_t i = get_global_id(0);\n"
75 " size_t last_i = get_global_size(0)-1;\n"
76 " size_t adjust = 0;\n"
77 " if(i == last_i && extra_last_thread != 0) { \n"
78 " adjust = 3-extra_last_thread;\n"
79 " }\n"
80 " vstore_half3( vload_half3(i, in-adjust), i, out-adjust);\n"
81 "}\n"
82 };
83
84 /*
85 const char *sourceV3aligned[] = {
86 "__kernel void test( const __global half *in, __global half *out )\n"
87 "{\n"
88 " size_t i = get_global_id(0);\n"
89 " vstorea_half3( vloada_half3(i, in), i, out);\n"
90 " vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
91 "}\n"
92 };
93
94 const char *doubleSourceV3aligned[] = {
95 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
96 "__kernel void test( const __global half *in, __global half *out )\n"
97 "{\n"
98 " size_t i = get_global_id(0);\n"
99 " vstorea_half3( vloada_half3(i, in), i, out);\n"
100 " vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
101 "}\n"
102 };
103 */
104
105 if(g_arrVecSizes[vectorSize] == 3) {
106 programs[vectorSize] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0]) );
107 if( NULL == programs[ vectorSize ] )
108 {
109 gFailCount++;
110
111 return -1;
112 }
113 } else {
114 programs[vectorSize] = MakeProgram( device, source, sizeof( source) / sizeof( source[0]) );
115 if( NULL == programs[ vectorSize ] )
116 {
117 gFailCount++;
118 return -1;
119 }
120 }
121
122 kernels[ vectorSize ] = clCreateKernel( programs[ vectorSize ], "test", &error );
123 if( NULL == kernels[vectorSize] )
124 {
125 gFailCount++;
126 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
127 return error;
128 }
129
130 if( gTestDouble )
131 {
132 if(g_arrVecSizes[vectorSize] == 3) {
133 doublePrograms[vectorSize] = MakeProgram( device, doubleSourceV3, sizeof( doubleSourceV3) / sizeof( doubleSourceV3[0]) );
134 if( NULL == doublePrograms[ vectorSize ] )
135 {
136 gFailCount++;
137 return -1;
138 }
139 } else {
140 doublePrograms[vectorSize] = MakeProgram( device, doubleSource, sizeof( doubleSource) / sizeof( doubleSource[0]) );
141 if( NULL == doublePrograms[ vectorSize ] )
142 {
143 gFailCount++;
144 return -1;
145 }
146 }
147
148 doubleKernels[ vectorSize ] = clCreateKernel( doublePrograms[ vectorSize ], "test", &error );
149 if( NULL == doubleKernels[vectorSize] )
150 {
151 gFailCount++;
152 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
153 return error;
154 }
155 }
156 }
157
158 // Figure out how many elements are in a work block
159 size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
160 size_t blockCount = (size_t)getBufferSize(device) / elementSize; //elementSize is a power of two
161 uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half
162 size_t stride = blockCount;
163
164 error = 0;
165 uint64_t printMask = (lastCase >> 4) - 1;
166 uint32_t count;
167 size_t loopCount;
168
169 for( i = 0; i < (uint64_t)lastCase; i += stride )
170 {
171 count = (uint32_t) MIN( blockCount, lastCase - i );
172
173 //Init the input stream
174 uint16_t *p = (uint16_t *)gIn_half;
175 for( j = 0; j < count; j++ )
176 p[j] = j + i;
177
178 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)) )
179 {
180 vlog_error( "Failure in clWriteArray\n" );
181 gFailCount++;
182 goto exit;
183 }
184
185 //Check the vector lengths
186 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
187 { // here we loop through vector sizes -- 3 is last.
188 uint32_t pattern = 0xdeaddead;
189 memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
190
191 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
192 {
193 vlog_error( "Failure in clWriteArray\n" );
194 gFailCount++;
195 goto exit;
196 }
197
198
199 // here is where "3" starts to cause problems.
200 error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
201 runsOverBy(count, vectorSize, false) );
202 if(error)
203 {
204 gFailCount++;
205 goto exit;
206 }
207
208 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
209 {
210 vlog_error( "Failure in clReadArray\n" );
211 gFailCount++;
212 goto exit;
213 }
214
215 if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
216 {
217 uint16_t *u1 = (uint16_t *)gOut_half;
218 uint16_t *u2 = (uint16_t *)gIn_half;
219 for( j = 0; j < count; j++ )
220 {
221 if( u1[j] != u2[j] )
222 {
223 uint16_t abs1 = u1[j] & 0x7fff;
224 uint16_t abs2 = u2[j] & 0x7fff;
225 if( abs1 > 0x7c00 && abs2 > 0x7c00 )
226 continue; //any NaN is okay if NaN is input
227
228 // if reference result is sub normal, test if the output is flushed to zero
229 if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
230 continue;
231
232 vlog_error( "%lld) (of %lld) Failure at 0x%4.4x: 0x%4.4x vector_size = %d \n", j, (uint64_t)count, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
233 gFailCount++;
234 error = -1;
235 goto exit;
236 }
237 }
238 }
239
240 if( gTestDouble )
241 {
242 memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
243 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
244 {
245 vlog_error( "Failure in clWriteArray\n" );
246 gFailCount++;
247 goto exit;
248 }
249
250
251 if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
252 runsOverBy(count, vectorSize, false) ) ) )
253 {
254 gFailCount++;
255 goto exit;
256 }
257
258 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
259 {
260 vlog_error( "Failure in clReadArray\n" );
261 gFailCount++;
262 goto exit;
263 }
264
265 if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
266 {
267 uint16_t *u1 = (uint16_t *)gOut_half;
268 uint16_t *u2 = (uint16_t *)gIn_half;
269 for( j = 0; j < count; j++ )
270 {
271 if( u1[j] != u2[j] )
272 {
273 uint16_t abs1 = u1[j] & 0x7fff;
274 uint16_t abs2 = u2[j] & 0x7fff;
275 if( abs1 > 0x7c00 && abs2 > 0x7c00 )
276 continue; //any NaN is okay if NaN is input
277
278 // if reference result is sub normal, test if the output is flushed to zero
279 if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
280 continue;
281
282 vlog_error( "%lld) Failure at 0x%4.4x: 0x%4.4x vector_size = %d (double precsion)\n", j, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
283 gFailCount++;
284 error = -1;
285 goto exit;
286 }
287 }
288 }
289 }
290 }
291
292 if( ((i+blockCount) & ~printMask) == (i+blockCount) )
293 {
294 vlog( "." );
295 fflush( stdout );
296 }
297 }
298
299 vlog( "\n" );
300
301 loopCount = 100;
302 if( gReportTimes )
303 {
304 //Run again for timing
305 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
306 {
307 uint64_t bestTime = -1ULL;
308
309 for( j = 0; j < loopCount; j++ )
310 {
311 uint64_t startTime = ReadTime();
312 if( (error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half,numVecs(count, vectorSize, false) ,
313 runsOverBy(count, vectorSize, false)) ) )
314 {
315 gFailCount++;
316 goto exit;
317 }
318
319 if( (error = clFinish(gQueue)) )
320 {
321 vlog_error( "Failure in clFinish\n" );
322 gFailCount++;
323 goto exit;
324 }
325 uint64_t currentTime = ReadTime() - startTime;
326 if( currentTime < bestTime )
327 bestTime = currentTime;
328 time[ vectorSize ] += currentTime;
329 }
330 if( bestTime < min_time[ vectorSize ] )
331 min_time[ vectorSize ] = bestTime;
332
333 if( gTestDouble )
334 {
335 bestTime = -1ULL;
336 for( j = 0; j < loopCount; j++ )
337 {
338 uint64_t startTime = ReadTime();
339 if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
340 runsOverBy(count, vectorSize, false)) ) )
341 {
342 gFailCount++;
343 goto exit;
344 }
345
346 if( (error = clFinish(gQueue)) )
347 {
348 vlog_error( "Failure in clFinish\n" );
349 gFailCount++;
350 goto exit;
351 }
352 uint64_t currentTime = ReadTime() - startTime;
353 if( currentTime < bestTime )
354 bestTime = currentTime;
355 doubleTime[ vectorSize ] += currentTime;
356 }
357 if( bestTime < min_double_time[ vectorSize ] )
358 min_double_time[ vectorSize ] = bestTime;
359 }
360 }
361 }
362
363 if( gReportTimes )
364 {
365 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
366 vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem", "roundTrip avg. (vector size: %d)", (g_arrVecSizes[vectorSize]) );
367 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
368 vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem", "roundTrip best (vector size: %d)", (g_arrVecSizes[vectorSize]) );
369 if( gTestDouble )
370 {
371 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
372 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem (double)", "roundTrip avg. d (vector size: %d)", (g_arrVecSizes[vectorSize]) );
373 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
374 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem (double)", "roundTrip best d (vector size: %d)", (g_arrVecSizes[vectorSize]) );
375 }
376 }
377
378 exit:
379 //clean up
380 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
381 {
382 clReleaseKernel( kernels[ vectorSize ] );
383 clReleaseProgram( programs[ vectorSize ] );
384 if( gTestDouble )
385 {
386 clReleaseKernel( doubleKernels[ vectorSize ] );
387 clReleaseProgram( doublePrograms[ vectorSize ] );
388 }
389 }
390
391 return error;
392 }
393
394
395