1 //
2 // Copyright (c) 2017,2021 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "imageHelpers.h"
17 #include <limits.h>
18 #include <assert.h>
19 #if defined(__APPLE__)
20 #include <sys/mman.h>
21 #endif
22 #if !defined(_WIN32) && !defined(__APPLE__)
23 #include <malloc.h>
24 #endif
25 #include <algorithm>
26 #include <iterator>
27 #if !defined(_WIN32)
28 #include <cmath>
29 #endif
30 
31 RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode;
32 
33 cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT;
34 bool gTestRounding = false;
sRGBmap(float fc)35 double sRGBmap(float fc)
36 {
37     double c = (double)fc;
38 
39 #if !defined(_WIN32)
40     if (std::isnan(c)) c = 0.0;
41 #else
42     if (_isnan(c)) c = 0.0;
43 #endif
44 
45     if (c > 1.0)
46         c = 1.0;
47     else if (c < 0.0)
48         c = 0.0;
49     else if (c < 0.0031308)
50         c = 12.92 * c;
51     else
52         c = (1055.0 / 1000.0) * pow(c, 5.0 / 12.0) - (55.0 / 1000.0);
53 
54     return c * 255.0;
55 }
56 
sRGBunmap(float fc)57 double sRGBunmap(float fc)
58 {
59     double c = (double)fc;
60     double result;
61 
62     if (c <= 0.04045)
63         result = c / 12.92;
64     else
65         result = pow((c + 0.055) / 1.055, 2.4);
66 
67     return result;
68 }
69 
70 
get_format_type_size(const cl_image_format * format)71 uint32_t get_format_type_size(const cl_image_format *format)
72 {
73     return get_channel_data_type_size(format->image_channel_data_type);
74 }
75 
get_channel_data_type_size(cl_channel_type channelType)76 uint32_t get_channel_data_type_size(cl_channel_type channelType)
77 {
78     switch (channelType)
79     {
80         case CL_SNORM_INT8:
81         case CL_UNORM_INT8:
82         case CL_SIGNED_INT8:
83         case CL_UNSIGNED_INT8: return 1;
84 
85         case CL_SNORM_INT16:
86         case CL_UNORM_INT16:
87         case CL_SIGNED_INT16:
88         case CL_UNSIGNED_INT16:
89         case CL_HALF_FLOAT:
90 #ifdef CL_SFIXED14_APPLE
91         case CL_SFIXED14_APPLE:
92 #endif
93             return sizeof(cl_short);
94 
95         case CL_SIGNED_INT32:
96         case CL_UNSIGNED_INT32: return sizeof(cl_int);
97 
98         case CL_UNORM_SHORT_565:
99         case CL_UNORM_SHORT_555:
100 #ifdef OBSOLETE_FORAMT
101         case CL_UNORM_SHORT_565_REV:
102         case CL_UNORM_SHORT_555_REV:
103 #endif
104             return 2;
105 
106 #ifdef OBSOLETE_FORAMT
107         case CL_UNORM_INT_8888:
108         case CL_UNORM_INT_8888_REV: return 4;
109 #endif
110 
111         case CL_UNORM_INT_101010:
112 #ifdef OBSOLETE_FORAMT
113         case CL_UNORM_INT_101010_REV:
114 #endif
115             return 4;
116 
117         case CL_FLOAT: return sizeof(cl_float);
118 
119         default: return 0;
120     }
121 }
122 
get_format_channel_count(const cl_image_format * format)123 uint32_t get_format_channel_count(const cl_image_format *format)
124 {
125     return get_channel_order_channel_count(format->image_channel_order);
126 }
127 
get_channel_order_channel_count(cl_channel_order order)128 uint32_t get_channel_order_channel_count(cl_channel_order order)
129 {
130     switch (order)
131     {
132         case CL_R:
133         case CL_A:
134         case CL_Rx:
135         case CL_INTENSITY:
136         case CL_LUMINANCE:
137         case CL_DEPTH:
138         case CL_DEPTH_STENCIL: return 1;
139 
140         case CL_RG:
141         case CL_RA:
142         case CL_RGx: return 2;
143 
144         case CL_RGB:
145         case CL_RGBx:
146         case CL_sRGB:
147         case CL_sRGBx: return 3;
148 
149         case CL_RGBA:
150         case CL_ARGB:
151         case CL_BGRA:
152         case CL_sRGBA:
153         case CL_sBGRA:
154         case CL_ABGR:
155 #ifdef CL_1RGB_APPLE
156         case CL_1RGB_APPLE:
157 #endif
158 #ifdef CL_BGR1_APPLE
159         case CL_BGR1_APPLE:
160 #endif
161 #ifdef CL_ABGR_APPLE
162         case CL_ABGR_APPLE:
163 #endif
164             return 4;
165 
166         default:
167             log_error("%s does not support 0x%x\n", __FUNCTION__, order);
168             return 0;
169     }
170 }
171 
get_channel_type_from_name(const char * name)172 cl_channel_type get_channel_type_from_name(const char *name)
173 {
174     struct
175     {
176         cl_channel_type type;
177         const char *name;
178     } typeNames[] = { { CL_SNORM_INT8, "CL_SNORM_INT8" },
179                       { CL_SNORM_INT16, "CL_SNORM_INT16" },
180                       { CL_UNORM_INT8, "CL_UNORM_INT8" },
181                       { CL_UNORM_INT16, "CL_UNORM_INT16" },
182                       { CL_UNORM_INT24, "CL_UNORM_INT24" },
183                       { CL_UNORM_SHORT_565, "CL_UNORM_SHORT_565" },
184                       { CL_UNORM_SHORT_555, "CL_UNORM_SHORT_555" },
185                       { CL_UNORM_INT_101010, "CL_UNORM_INT_101010" },
186                       { CL_SIGNED_INT8, "CL_SIGNED_INT8" },
187                       { CL_SIGNED_INT16, "CL_SIGNED_INT16" },
188                       { CL_SIGNED_INT32, "CL_SIGNED_INT32" },
189                       { CL_UNSIGNED_INT8, "CL_UNSIGNED_INT8" },
190                       { CL_UNSIGNED_INT16, "CL_UNSIGNED_INT16" },
191                       { CL_UNSIGNED_INT32, "CL_UNSIGNED_INT32" },
192                       { CL_HALF_FLOAT, "CL_HALF_FLOAT" },
193                       { CL_FLOAT, "CL_FLOAT" },
194 #ifdef CL_SFIXED14_APPLE
195                       { CL_SFIXED14_APPLE, "CL_SFIXED14_APPLE" }
196 #endif
197     };
198     for (size_t i = 0; i < sizeof(typeNames) / sizeof(typeNames[0]); i++)
199     {
200         if (strcmp(typeNames[i].name, name) == 0
201             || strcmp(typeNames[i].name + 3, name) == 0)
202             return typeNames[i].type;
203     }
204     return (cl_channel_type)-1;
205 }
206 
get_channel_order_from_name(const char * name)207 cl_channel_order get_channel_order_from_name(const char *name)
208 {
209     const struct
210     {
211         cl_channel_order order;
212         const char *name;
213     } orderNames[] = {
214         { CL_R, "CL_R" },
215         { CL_A, "CL_A" },
216         { CL_Rx, "CL_Rx" },
217         { CL_RG, "CL_RG" },
218         { CL_RA, "CL_RA" },
219         { CL_RGx, "CL_RGx" },
220         { CL_RGB, "CL_RGB" },
221         { CL_RGBx, "CL_RGBx" },
222         { CL_RGBA, "CL_RGBA" },
223         { CL_BGRA, "CL_BGRA" },
224         { CL_ARGB, "CL_ARGB" },
225         { CL_INTENSITY, "CL_INTENSITY" },
226         { CL_LUMINANCE, "CL_LUMINANCE" },
227         { CL_DEPTH, "CL_DEPTH" },
228         { CL_DEPTH_STENCIL, "CL_DEPTH_STENCIL" },
229         { CL_sRGB, "CL_sRGB" },
230         { CL_sRGBx, "CL_sRGBx" },
231         { CL_sRGBA, "CL_sRGBA" },
232         { CL_sBGRA, "CL_sBGRA" },
233         { CL_ABGR, "CL_ABGR" },
234 #ifdef CL_1RGB_APPLE
235         { CL_1RGB_APPLE, "CL_1RGB_APPLE" },
236 #endif
237 #ifdef CL_BGR1_APPLE
238         { CL_BGR1_APPLE, "CL_BGR1_APPLE" },
239 #endif
240     };
241 
242     for (size_t i = 0; i < sizeof(orderNames) / sizeof(orderNames[0]); i++)
243     {
244         if (strcmp(orderNames[i].name, name) == 0
245             || strcmp(orderNames[i].name + 3, name) == 0)
246             return orderNames[i].order;
247     }
248     return (cl_channel_order)-1;
249 }
250 
251 
is_format_signed(const cl_image_format * format)252 int is_format_signed(const cl_image_format *format)
253 {
254     switch (format->image_channel_data_type)
255     {
256         case CL_SNORM_INT8:
257         case CL_SIGNED_INT8:
258         case CL_SNORM_INT16:
259         case CL_SIGNED_INT16:
260         case CL_SIGNED_INT32:
261         case CL_HALF_FLOAT:
262         case CL_FLOAT:
263 #ifdef CL_SFIXED14_APPLE
264         case CL_SFIXED14_APPLE:
265 #endif
266             return 1;
267 
268         default: return 0;
269     }
270 }
271 
get_pixel_size(const cl_image_format * format)272 uint32_t get_pixel_size(const cl_image_format *format)
273 {
274     switch (format->image_channel_data_type)
275     {
276         case CL_SNORM_INT8:
277         case CL_UNORM_INT8:
278         case CL_SIGNED_INT8:
279         case CL_UNSIGNED_INT8: return get_format_channel_count(format);
280 
281         case CL_SNORM_INT16:
282         case CL_UNORM_INT16:
283         case CL_SIGNED_INT16:
284         case CL_UNSIGNED_INT16:
285         case CL_HALF_FLOAT:
286 #ifdef CL_SFIXED14_APPLE
287         case CL_SFIXED14_APPLE:
288 #endif
289             return get_format_channel_count(format) * sizeof(cl_ushort);
290 
291         case CL_SIGNED_INT32:
292         case CL_UNSIGNED_INT32:
293             return get_format_channel_count(format) * sizeof(cl_int);
294 
295         case CL_UNORM_SHORT_565:
296         case CL_UNORM_SHORT_555:
297 #ifdef OBSOLETE_FORAMT
298         case CL_UNORM_SHORT_565_REV:
299         case CL_UNORM_SHORT_555_REV:
300 #endif
301             return 2;
302 
303 #ifdef OBSOLETE_FORAMT
304         case CL_UNORM_INT_8888:
305         case CL_UNORM_INT_8888_REV: return 4;
306 #endif
307 
308         case CL_UNORM_INT_101010:
309 #ifdef OBSOLETE_FORAMT
310         case CL_UNORM_INT_101010_REV:
311 #endif
312             return 4;
313 
314         case CL_FLOAT:
315             return get_format_channel_count(format) * sizeof(cl_float);
316 
317         default: return 0;
318     }
319 }
320 
next_power_of_two(uint32_t v)321 uint32_t next_power_of_two(uint32_t v)
322 {
323     v--;
324     v |= v >> 1;
325     v |= v >> 2;
326     v |= v >> 4;
327     v |= v >> 8;
328     v |= v >> 16;
329     v++;
330     return v;
331 }
332 
get_pixel_alignment(const cl_image_format * format)333 uint32_t get_pixel_alignment(const cl_image_format *format)
334 {
335     return next_power_of_two(get_pixel_size(format));
336 }
337 
get_8_bit_image_format(cl_context context,cl_mem_object_type objType,cl_mem_flags flags,size_t channelCount,cl_image_format * outFormat)338 int get_8_bit_image_format(cl_context context, cl_mem_object_type objType,
339                            cl_mem_flags flags, size_t channelCount,
340                            cl_image_format *outFormat)
341 {
342     cl_image_format formatList[128];
343     unsigned int outFormatCount, i;
344     int error;
345 
346 
347     /* Make sure each image format is supported */
348     if ((error = clGetSupportedImageFormats(context, flags, objType, 128,
349                                             formatList, &outFormatCount)))
350         return error;
351 
352 
353     /* Look for one that is an 8-bit format */
354     for (i = 0; i < outFormatCount; i++)
355     {
356         if (formatList[i].image_channel_data_type == CL_SNORM_INT8
357             || formatList[i].image_channel_data_type == CL_UNORM_INT8
358             || formatList[i].image_channel_data_type == CL_SIGNED_INT8
359             || formatList[i].image_channel_data_type == CL_UNSIGNED_INT8)
360         {
361             if (!channelCount
362                 || (channelCount
363                     && (get_format_channel_count(&formatList[i])
364                         == channelCount)))
365             {
366                 *outFormat = formatList[i];
367                 return 0;
368             }
369         }
370     }
371 
372     return -1;
373 }
374 
get_32_bit_image_format(cl_context context,cl_mem_object_type objType,cl_mem_flags flags,size_t channelCount,cl_image_format * outFormat)375 int get_32_bit_image_format(cl_context context, cl_mem_object_type objType,
376                             cl_mem_flags flags, size_t channelCount,
377                             cl_image_format *outFormat)
378 {
379     cl_image_format formatList[128];
380     unsigned int outFormatCount, i;
381     int error;
382 
383 
384     /* Make sure each image format is supported */
385     if ((error = clGetSupportedImageFormats(context, flags, objType, 128,
386                                             formatList, &outFormatCount)))
387         return error;
388 
389     /* Look for one that is an 8-bit format */
390     for (i = 0; i < outFormatCount; i++)
391     {
392         if (formatList[i].image_channel_data_type == CL_UNORM_INT_101010
393             || formatList[i].image_channel_data_type == CL_FLOAT
394             || formatList[i].image_channel_data_type == CL_SIGNED_INT32
395             || formatList[i].image_channel_data_type == CL_UNSIGNED_INT32)
396         {
397             if (!channelCount
398                 || (channelCount
399                     && (get_format_channel_count(&formatList[i])
400                         == channelCount)))
401             {
402                 *outFormat = formatList[i];
403                 return 0;
404             }
405         }
406     }
407 
408     return -1;
409 }
410 
print_first_pixel_difference_error(size_t where,const char * sourcePixel,const char * destPixel,image_descriptor * imageInfo,size_t y,size_t thirdDim)411 void print_first_pixel_difference_error(size_t where, const char *sourcePixel,
412                                         const char *destPixel,
413                                         image_descriptor *imageInfo, size_t y,
414                                         size_t thirdDim)
415 {
416     size_t pixel_size = get_pixel_size(imageInfo->format);
417 
418     log_error("ERROR: Scanline %d did not verify for image size %d,%d,%d "
419               "pitch %d (extra %d bytes)\n",
420               (int)y, (int)imageInfo->width, (int)imageInfo->height,
421               (int)thirdDim, (int)imageInfo->rowPitch,
422               (int)imageInfo->rowPitch
423                   - (int)imageInfo->width * (int)pixel_size);
424     log_error("Failed at column: %ld   ", where);
425 
426     switch (pixel_size)
427     {
428         case 1:
429             log_error("*0x%2.2x vs. 0x%2.2x\n", ((cl_uchar *)sourcePixel)[0],
430                       ((cl_uchar *)destPixel)[0]);
431             break;
432         case 2:
433             log_error("*0x%4.4x vs. 0x%4.4x\n", ((cl_ushort *)sourcePixel)[0],
434                       ((cl_ushort *)destPixel)[0]);
435             break;
436         case 3:
437             log_error("*{0x%2.2x, 0x%2.2x, 0x%2.2x} vs. "
438                       "{0x%2.2x, 0x%2.2x, 0x%2.2x}\n",
439                       ((cl_uchar *)sourcePixel)[0],
440                       ((cl_uchar *)sourcePixel)[1],
441                       ((cl_uchar *)sourcePixel)[2], ((cl_uchar *)destPixel)[0],
442                       ((cl_uchar *)destPixel)[1], ((cl_uchar *)destPixel)[2]);
443             break;
444         case 4:
445             log_error("*0x%8.8x vs. 0x%8.8x\n", ((cl_uint *)sourcePixel)[0],
446                       ((cl_uint *)destPixel)[0]);
447             break;
448         case 6:
449             log_error(
450                 "*{0x%4.4x, 0x%4.4x, 0x%4.4x} vs. "
451                 "{0x%4.4x, 0x%4.4x, 0x%4.4x}\n",
452                 ((cl_ushort *)sourcePixel)[0], ((cl_ushort *)sourcePixel)[1],
453                 ((cl_ushort *)sourcePixel)[2], ((cl_ushort *)destPixel)[0],
454                 ((cl_ushort *)destPixel)[1], ((cl_ushort *)destPixel)[2]);
455             break;
456         case 8:
457             log_error("*0x%16.16llx vs. 0x%16.16llx\n",
458                       ((cl_ulong *)sourcePixel)[0], ((cl_ulong *)destPixel)[0]);
459             break;
460         case 12:
461             log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x} vs. "
462                       "{0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
463                       ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1],
464                       ((cl_uint *)sourcePixel)[2], ((cl_uint *)destPixel)[0],
465                       ((cl_uint *)destPixel)[1], ((cl_uint *)destPixel)[2]);
466             break;
467         case 16:
468             log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x} vs. "
469                       "{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
470                       ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1],
471                       ((cl_uint *)sourcePixel)[2], ((cl_uint *)sourcePixel)[3],
472                       ((cl_uint *)destPixel)[0], ((cl_uint *)destPixel)[1],
473                       ((cl_uint *)destPixel)[2], ((cl_uint *)destPixel)[3]);
474             break;
475         default:
476             log_error("Don't know how to print pixel size of %ld\n",
477                       pixel_size);
478             break;
479     }
480 }
481 
random_log_in_range(int minV,int maxV,MTdata d)482 int random_log_in_range(int minV, int maxV, MTdata d)
483 {
484     double v = log2(((double)genrand_int32(d) / (double)0xffffffff) + 1);
485     int iv = (int)((float)(maxV - minV) * v);
486     return iv + minV;
487 }
488 
489 
490 // Define the addressing functions
491 typedef int (*AddressFn)(int value, size_t maxValue);
492 
NoAddressFn(int value,size_t maxValue)493 int NoAddressFn(int value, size_t maxValue) { return value; }
RepeatAddressFn(int value,size_t maxValue)494 int RepeatAddressFn(int value, size_t maxValue)
495 {
496     if (value < 0)
497         value += (int)maxValue;
498     else if (value >= (int)maxValue)
499         value -= (int)maxValue;
500     return value;
501 }
MirroredRepeatAddressFn(int value,size_t maxValue)502 int MirroredRepeatAddressFn(int value, size_t maxValue)
503 {
504     if (value < 0)
505         value = 0;
506     else if ((size_t)value >= maxValue)
507         value = (int)(maxValue - 1);
508     return value;
509 }
ClampAddressFn(int value,size_t maxValue)510 int ClampAddressFn(int value, size_t maxValue)
511 {
512     return (value < -1) ? -1
513                         : ((value > (cl_long)maxValue) ? (int)maxValue : value);
514 }
ClampToEdgeNearestFn(int value,size_t maxValue)515 int ClampToEdgeNearestFn(int value, size_t maxValue)
516 {
517     return (value < 0)
518         ? 0
519         : (((size_t)value > maxValue - 1) ? (int)maxValue - 1 : value);
520 }
521 AddressFn ClampToEdgeLinearFn = ClampToEdgeNearestFn;
522 
523 // Note: normalized coords get repeated in normalized space, not unnormalized
524 // space! hence the special case here
525 volatile float gFloatHome;
RepeatNormalizedAddressFn(float fValue,size_t maxValue)526 float RepeatNormalizedAddressFn(float fValue, size_t maxValue)
527 {
528 #ifndef _MSC_VER // Use original if not the VS compiler.
529     // General computation for repeat
530     return (fValue - floorf(fValue)) * (float)maxValue; // Reduce to [0, 1.f]
531 #else // Otherwise, use this instead:
532     // Home the subtraction to a float to break up the sequence of x87
533     // instructions emitted by the VS compiler.
534     gFloatHome = fValue - floorf(fValue);
535     return gFloatHome * (float)maxValue;
536 #endif
537 }
538 
MirroredRepeatNormalizedAddressFn(float fValue,size_t maxValue)539 float MirroredRepeatNormalizedAddressFn(float fValue, size_t maxValue)
540 {
541     // Round to nearest multiple of two.
542     // Note halfway values flip flop here due to rte, but they both end up
543     // pointing the same place at the end of the day.
544     float s_prime = 2.0f * rintf(fValue * 0.5f);
545 
546     // Reduce to [-1, 1], Apply mirroring -> [0, 1]
547     s_prime = fabsf(fValue - s_prime);
548 
549     // un-normalize
550     return s_prime * (float)maxValue;
551 }
552 
553 struct AddressingTable
554 {
AddressingTableAddressingTable555     AddressingTable()
556     {
557         ct_assert((CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6));
558         ct_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2);
559 
560         mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE]
561               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = NoAddressFn;
562         mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE]
563               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = NoAddressFn;
564         mTable[CL_ADDRESS_REPEAT - CL_ADDRESS_NONE]
565               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = RepeatAddressFn;
566         mTable[CL_ADDRESS_REPEAT - CL_ADDRESS_NONE]
567               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = RepeatAddressFn;
568         mTable[CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE]
569               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = ClampToEdgeNearestFn;
570         mTable[CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE]
571               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = ClampToEdgeLinearFn;
572         mTable[CL_ADDRESS_CLAMP - CL_ADDRESS_NONE]
573               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = ClampAddressFn;
574         mTable[CL_ADDRESS_CLAMP - CL_ADDRESS_NONE]
575               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = ClampAddressFn;
576         mTable[CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE]
577               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = MirroredRepeatAddressFn;
578         mTable[CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE]
579               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = MirroredRepeatAddressFn;
580     }
581 
operator []AddressingTable582     AddressFn operator[](image_sampler_data *sampler)
583     {
584         return mTable[(int)sampler->addressing_mode - CL_ADDRESS_NONE]
585                      [(int)sampler->filter_mode - CL_FILTER_NEAREST];
586     }
587 
588     AddressFn mTable[6][2];
589 };
590 
591 static AddressingTable sAddressingTable;
592 
is_sRGBA_order(cl_channel_order image_channel_order)593 bool is_sRGBA_order(cl_channel_order image_channel_order)
594 {
595     switch (image_channel_order)
596     {
597         case CL_sRGB:
598         case CL_sRGBx:
599         case CL_sRGBA:
600         case CL_sBGRA: return true;
601         default: return false;
602     }
603 }
604 
605 // Format helpers
606 
has_alpha(const cl_image_format * format)607 int has_alpha(const cl_image_format *format)
608 {
609     switch (format->image_channel_order)
610     {
611         case CL_R: return 0;
612         case CL_A: return 1;
613         case CL_Rx: return 0;
614         case CL_RG: return 0;
615         case CL_RA: return 1;
616         case CL_RGx: return 0;
617         case CL_RGB:
618         case CL_sRGB: return 0;
619         case CL_RGBx:
620         case CL_sRGBx: return 0;
621         case CL_RGBA: return 1;
622         case CL_BGRA: return 1;
623         case CL_ARGB: return 1;
624         case CL_ABGR: return 1;
625         case CL_INTENSITY: return 1;
626         case CL_LUMINANCE: return 0;
627 #ifdef CL_BGR1_APPLE
628         case CL_BGR1_APPLE: return 1;
629 #endif
630 #ifdef CL_1RGB_APPLE
631         case CL_1RGB_APPLE: return 1;
632 #endif
633         case CL_sRGBA:
634         case CL_sBGRA: return 1;
635         case CL_DEPTH: return 0;
636         default:
637             log_error("Invalid image channel order: %d\n",
638                       format->image_channel_order);
639             return 0;
640     }
641 }
642 
643 #define PRINT_MAX_SIZE_LOGIC 0
644 
645 #define SWAP(_a, _b)                                                           \
646     do                                                                         \
647     {                                                                          \
648         _a ^= _b;                                                              \
649         _b ^= _a;                                                              \
650         _a ^= _b;                                                              \
651     } while (0)
652 #ifndef MAX
653 #define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b))
654 #endif
655 
get_max_sizes(size_t * numberOfSizes,const int maxNumberOfSizes,size_t sizes[][3],size_t maxWidth,size_t maxHeight,size_t maxDepth,size_t maxArraySize,const cl_ulong maxIndividualAllocSize,const cl_ulong maxTotalAllocSize,cl_mem_object_type image_type,const cl_image_format * format,int usingMaxPixelSizeBuffer)656 void get_max_sizes(
657     size_t *numberOfSizes, const int maxNumberOfSizes, size_t sizes[][3],
658     size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize,
659     const cl_ulong maxIndividualAllocSize, // CL_DEVICE_MAX_MEM_ALLOC_SIZE
660     const cl_ulong maxTotalAllocSize, // CL_DEVICE_GLOBAL_MEM_SIZE
661     cl_mem_object_type image_type, const cl_image_format *format,
662     int usingMaxPixelSizeBuffer)
663 {
664 
665     bool is3D = (image_type == CL_MEM_OBJECT_IMAGE3D);
666     bool isArray = (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY
667                     || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY);
668 
669     // Validate we have a reasonable max depth for 3D
670     if (is3D && maxDepth < 2)
671     {
672         log_error("ERROR: Requesting max image sizes for 3D images when max "
673                   "depth is < 2.\n");
674         *numberOfSizes = 0;
675         return;
676     }
677     // Validate we have a reasonable max array size for 1D & 2D image arrays
678     if (isArray && maxArraySize < 2)
679     {
680         log_error("ERROR: Requesting max image sizes for an image array when "
681                   "max array size is < 1.\n");
682         *numberOfSizes = 0;
683         return;
684     }
685 
686     // Reduce the maximum because we are trying to test the max image
687     // dimensions, not the memory allocation
688     cl_ulong adjustedMaxTotalAllocSize = maxTotalAllocSize / 4;
689     cl_ulong adjustedMaxIndividualAllocSize = maxIndividualAllocSize / 4;
690     log_info("Note: max individual allocation adjusted down from %gMB to %gMB "
691              "and max total allocation adjusted down from %gMB to %gMB.\n",
692              maxIndividualAllocSize / (1024.0 * 1024.0),
693              adjustedMaxIndividualAllocSize / (1024.0 * 1024.0),
694              maxTotalAllocSize / (1024.0 * 1024.0),
695              adjustedMaxTotalAllocSize / (1024.0 * 1024.0));
696 
697     // Cap our max allocation to 1.0GB.
698     // FIXME -- why?  In the interest of not taking a long time?  We should
699     // still test this stuff...
700     if (adjustedMaxTotalAllocSize > (cl_ulong)1024 * 1024 * 1024)
701     {
702         adjustedMaxTotalAllocSize = (cl_ulong)1024 * 1024 * 1024;
703         log_info("Limiting max total allocation size to %gMB (down from %gMB) "
704                  "for test.\n",
705                  adjustedMaxTotalAllocSize / (1024.0 * 1024.0),
706                  maxTotalAllocSize / (1024.0 * 1024.0));
707     }
708 
709     cl_ulong maxAllocSize = adjustedMaxIndividualAllocSize;
710     if (adjustedMaxTotalAllocSize < adjustedMaxIndividualAllocSize * 2)
711         maxAllocSize = adjustedMaxTotalAllocSize / 2;
712 
713     size_t raw_pixel_size = get_pixel_size(format);
714     // If the test will be creating input (src) buffer of type int4 or float4,
715     // number of pixels will be governed by sizeof(int4 or float4) and not
716     // sizeof(dest fomat) Also if pixel size is 12 bytes i.e. RGB or RGBx, we
717     // adjust it to 16 bytes as GPUs has no concept of 3 channel images. GPUs
718     // expand these to four channel RGBA.
719     if (usingMaxPixelSizeBuffer || raw_pixel_size == 12) raw_pixel_size = 16;
720     size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size;
721 
722     log_info("Maximums: [%ld x %ld x %ld], raw pixel size %lu bytes, "
723              "per-allocation limit %gMB.\n",
724              maxWidth, maxHeight, isArray ? maxArraySize : maxDepth,
725              raw_pixel_size, (maxAllocSize / (1024.0 * 1024.0)));
726 
727     // Keep track of the maximum sizes for each dimension
728     size_t maximum_sizes[] = { maxWidth, maxHeight, maxDepth };
729 
730     switch (image_type)
731     {
732         case CL_MEM_OBJECT_IMAGE1D_ARRAY:
733             maximum_sizes[1] = maxArraySize;
734             maximum_sizes[2] = 1;
735             break;
736         case CL_MEM_OBJECT_IMAGE2D_ARRAY:
737             maximum_sizes[2] = maxArraySize;
738             break;
739     }
740 
741 
742         // Given one fixed sized dimension, this code finds one or two other
743         // dimensions, both with very small size, such that the size does not
744         // exceed the maximum passed to this function
745 
746 #if defined(__x86_64) || defined(__arm64__) || defined(__ppc64__)
747     size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 10, 11, 13, 15 };
748 #else
749     size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 11, 13 };
750 #endif
751 
752     static size_t other_size = 0;
753     enum
754     {
755         num_other_sizes = sizeof(other_sizes) / sizeof(size_t)
756     };
757 
758     (*numberOfSizes) = 0;
759 
760     if (image_type == CL_MEM_OBJECT_IMAGE1D)
761     {
762 
763         double M = maximum_sizes[0];
764 
765         // Store the size
766         sizes[(*numberOfSizes)][0] = (size_t)M;
767         sizes[(*numberOfSizes)][1] = 1;
768         sizes[(*numberOfSizes)][2] = 1;
769         ++(*numberOfSizes);
770     }
771 
772     else if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY
773              || image_type == CL_MEM_OBJECT_IMAGE2D)
774     {
775 
776         for (int fixed_dim = 0; fixed_dim < 2; ++fixed_dim)
777         {
778 
779             // Determine the size of the fixed dimension
780             double M = maximum_sizes[fixed_dim];
781             double A = max_pixels;
782 
783             int x0_dim = !fixed_dim;
784             double x0 =
785                 fmin(fmin(other_sizes[(other_size++) % num_other_sizes], A / M),
786                      maximum_sizes[x0_dim]);
787 
788             // Store the size
789             sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
790             sizes[(*numberOfSizes)][x0_dim] = (size_t)x0;
791             sizes[(*numberOfSizes)][2] = 1;
792             ++(*numberOfSizes);
793         }
794     }
795 
796     else if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY
797              || image_type == CL_MEM_OBJECT_IMAGE3D)
798     {
799 
800         // Iterate over dimensions, finding sizes for the non-fixed dimension
801         for (int fixed_dim = 0; fixed_dim < 3; ++fixed_dim)
802         {
803 
804             // Determine the size of the fixed dimension
805             double M = maximum_sizes[fixed_dim];
806             double A = max_pixels;
807 
808             // Find two other dimensions, x0 and x1
809             int x0_dim = (fixed_dim == 0) ? 1 : 0;
810             int x1_dim = (fixed_dim == 2) ? 1 : 2;
811 
812             // Choose two other sizes for these dimensions
813             double x0 = fmin(fmin(A / M, maximum_sizes[x0_dim]),
814                              other_sizes[(other_size++) % num_other_sizes]);
815             // GPUs have certain restrictions on minimum width (row alignment)
816             // of images which has given us issues testing small widths in this
817             // test (say we set width to 3 for testing, and compute size based
818             // on this width and decide it fits within vram ... but GPU driver
819             // decides that, due to row alignment requirements, it has to use
820             // width of 16 which doesnt fit in vram). For this purpose we are
821             // not testing width < 16 for this test.
822             if (x0_dim == 0 && x0 < 16) x0 = 16;
823             double x1 = fmin(fmin(A / M / x0, maximum_sizes[x1_dim]),
824                              other_sizes[(other_size++) % num_other_sizes]);
825 
826             // Valid image sizes cannot be below 1. Due to the workaround for
827             // the xo_dim where x0 is overidden to 16 there might not be enough
828             // space left for x1 dimension. This could be a fractional 0.x size
829             // that when cast to integer would result in a value 0. In these
830             // cases we clamp the size to a minimum of 1.
831             if (x1 < 1) x1 = 1;
832 
833             // M and x0 cannot be '0' as they derive from clDeviceInfo calls
834             assert(x0 > 0 && M > 0);
835 
836             // Store the size
837             sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
838             sizes[(*numberOfSizes)][x0_dim] = (size_t)x0;
839             sizes[(*numberOfSizes)][x1_dim] = (size_t)x1;
840             ++(*numberOfSizes);
841         }
842     }
843 
844     // Log the results
845     for (int j = 0; j < (int)(*numberOfSizes); j++)
846     {
847         switch (image_type)
848         {
849             case CL_MEM_OBJECT_IMAGE1D:
850                 log_info(" size[%d] = [%ld] (%g MB image)\n", j, sizes[j][0],
851                          raw_pixel_size * sizes[j][0] * sizes[j][1]
852                              * sizes[j][2] / (1024.0 * 1024.0));
853                 break;
854             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
855             case CL_MEM_OBJECT_IMAGE2D:
856                 log_info(" size[%d] = [%ld %ld] (%g MB image)\n", j,
857                          sizes[j][0], sizes[j][1],
858                          raw_pixel_size * sizes[j][0] * sizes[j][1]
859                              * sizes[j][2] / (1024.0 * 1024.0));
860                 break;
861             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
862             case CL_MEM_OBJECT_IMAGE3D:
863                 log_info(" size[%d] = [%ld %ld %ld] (%g MB image)\n", j,
864                          sizes[j][0], sizes[j][1], sizes[j][2],
865                          raw_pixel_size * sizes[j][0] * sizes[j][1]
866                              * sizes[j][2] / (1024.0 * 1024.0));
867                 break;
868         }
869     }
870 }
871 
get_max_absolute_error(const cl_image_format * format,image_sampler_data * sampler)872 float get_max_absolute_error(const cl_image_format *format,
873                              image_sampler_data *sampler)
874 {
875     if (sampler->filter_mode == CL_FILTER_NEAREST) return 0.0f;
876 
877     switch (format->image_channel_data_type)
878     {
879         case CL_SNORM_INT8: return 1.0f / 127.0f;
880         case CL_UNORM_INT8: return 1.0f / 255.0f;
881         case CL_UNORM_INT16: return 1.0f / 65535.0f;
882         case CL_SNORM_INT16: return 1.0f / 32767.0f;
883         case CL_FLOAT: return CL_FLT_MIN;
884 #ifdef CL_SFIXED14_APPLE
885         case CL_SFIXED14_APPLE: return 0x1.0p-14f;
886 #endif
887         default: return 0.0f;
888     }
889 }
890 
get_max_relative_error(const cl_image_format * format,image_sampler_data * sampler,int is3D,int isLinearFilter)891 float get_max_relative_error(const cl_image_format *format,
892                              image_sampler_data *sampler, int is3D,
893                              int isLinearFilter)
894 {
895     float maxError = 0.0f;
896     float sampleCount = 1.0f;
897     if (isLinearFilter) sampleCount = is3D ? 8.0f : 4.0f;
898 
899     // Note that the ULP is defined here as the unit in the last place of the
900     // maximum magnitude sample used for filtering.
901 
902     // Section 8.3
903     switch (format->image_channel_data_type)
904     {
905         // The spec allows 2 ulps of error for normalized formats
906         case CL_SNORM_INT8:
907         case CL_UNORM_INT8:
908         case CL_SNORM_INT16:
909         case CL_UNORM_INT16:
910         case CL_UNORM_SHORT_565:
911         case CL_UNORM_SHORT_555:
912         case CL_UNORM_INT_101010:
913             // Maximum sampling error for round to zero normalization based on
914             // multiplication by reciprocal (using reciprocal generated in
915             // round to +inf mode, so that 1.0 matches spec)
916             maxError = 2 * FLT_EPSILON * sampleCount;
917             break;
918 
919             // If the implementation supports these formats then it will have to
920             // allow rounding error here too, because not all 32-bit ints are
921             // exactly representable in float
922         case CL_SIGNED_INT32:
923         case CL_UNSIGNED_INT32: maxError = 1 * FLT_EPSILON; break;
924     }
925 
926 
927     // Section 8.2
928     if (sampler->addressing_mode == CL_ADDRESS_REPEAT
929         || sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT
930         || sampler->filter_mode != CL_FILTER_NEAREST
931         || sampler->normalized_coords)
932 #if defined(__APPLE__)
933     {
934         if (sampler->filter_mode != CL_FILTER_NEAREST)
935         {
936             // The maximum
937             if (gDeviceType == CL_DEVICE_TYPE_GPU)
938                 // Some GPUs ain't so accurate
939                 maxError += MAKE_HEX_FLOAT(0x1.0p-4f, 0x1L, -4);
940             else
941                 // The standard method of 2d linear filtering delivers 4.0 ulps
942                 // of error in round to nearest (8 in rtz).
943                 maxError += 4.0f * FLT_EPSILON;
944         }
945         else
946             // normalized coordinates will introduce some error into the
947             // fractional part of the address, affecting results
948             maxError += 4.0f * FLT_EPSILON;
949     }
950 #else
951     {
952 #if !defined(_WIN32)
953 #warning Implementations will likely wish to pick a max allowable sampling error policy here that is better than the spec
954 #endif
955         // The spec allows linear filters to return any result most of the time.
956         // That's fine for implementations but a problem for testing. After all
957         // users aren't going to like garbage images.  We have "picked a number"
958         // here that we are going to attempt to conform to. Implementations are
959         // free to pick another number, like infinity, if they like.
960         // We picked a number for you, to provide /some/ sanity
961         maxError = MAKE_HEX_FLOAT(0x1.0p-7f, 0x1L, -7);
962         // ...but this is what the spec allows:
963         // maxError = INFINITY;
964         // Please feel free to pick any positive number. (NaN wont work.)
965     }
966 #endif
967 
968     // The error calculation itself can introduce error
969     maxError += FLT_EPSILON * 2;
970 
971     return maxError;
972 }
973 
get_format_max_int(const cl_image_format * format)974 size_t get_format_max_int(const cl_image_format *format)
975 {
976     switch (format->image_channel_data_type)
977     {
978         case CL_SNORM_INT8:
979         case CL_SIGNED_INT8: return 127;
980         case CL_UNORM_INT8:
981         case CL_UNSIGNED_INT8: return 255;
982 
983         case CL_SNORM_INT16:
984         case CL_SIGNED_INT16: return 32767;
985 
986         case CL_UNORM_INT16:
987         case CL_UNSIGNED_INT16: return 65535;
988 
989         case CL_SIGNED_INT32: return 2147483647L;
990 
991         case CL_UNSIGNED_INT32: return 4294967295LL;
992 
993         case CL_UNORM_SHORT_565:
994         case CL_UNORM_SHORT_555: return 31;
995 
996         case CL_UNORM_INT_101010: return 1023;
997 
998         case CL_HALF_FLOAT: return 1 << 10;
999 
1000 #ifdef CL_SFIXED14_APPLE
1001         case CL_SFIXED14_APPLE: return 16384;
1002 #endif
1003         default: return 0;
1004     }
1005 }
1006 
get_format_min_int(const cl_image_format * format)1007 int get_format_min_int(const cl_image_format *format)
1008 {
1009     switch (format->image_channel_data_type)
1010     {
1011         case CL_SNORM_INT8:
1012         case CL_SIGNED_INT8: return -128;
1013         case CL_UNORM_INT8:
1014         case CL_UNSIGNED_INT8: return 0;
1015 
1016         case CL_SNORM_INT16:
1017         case CL_SIGNED_INT16: return -32768;
1018 
1019         case CL_UNORM_INT16:
1020         case CL_UNSIGNED_INT16: return 0;
1021 
1022         case CL_SIGNED_INT32: return -2147483648LL;
1023 
1024         case CL_UNSIGNED_INT32: return 0;
1025 
1026         case CL_UNORM_SHORT_565:
1027         case CL_UNORM_SHORT_555:
1028         case CL_UNORM_INT_101010: return 0;
1029 
1030         case CL_HALF_FLOAT: return -(1 << 10);
1031 
1032 #ifdef CL_SFIXED14_APPLE
1033         case CL_SFIXED14_APPLE: return -16384;
1034 #endif
1035 
1036         default: return 0;
1037     }
1038 }
1039 
convert_float_to_half(float f)1040 cl_half convert_float_to_half(float f)
1041 {
1042     switch (gFloatToHalfRoundingMode)
1043     {
1044         case kRoundToNearestEven: return cl_half_from_float(f, CL_HALF_RTE);
1045         case kRoundTowardZero: return cl_half_from_float(f, CL_HALF_RTZ);
1046         default:
1047             log_error("ERROR: Test internal error -- unhandled or unknown "
1048                       "float->half rounding mode.\n");
1049             exit(-1);
1050             return 0xffff;
1051     }
1052 }
1053 
get_image_size(image_descriptor const * imageInfo)1054 cl_ulong get_image_size(image_descriptor const *imageInfo)
1055 {
1056     cl_ulong imageSize;
1057 
1058     // Assumes rowPitch and slicePitch are always correctly defined
1059     if (/*gTestMipmaps*/ imageInfo->num_mip_levels > 1)
1060     {
1061         imageSize = (size_t)compute_mipmapped_image_size(*imageInfo);
1062     }
1063     else
1064     {
1065         switch (imageInfo->type)
1066         {
1067             case CL_MEM_OBJECT_IMAGE1D: imageSize = imageInfo->rowPitch; break;
1068             case CL_MEM_OBJECT_IMAGE2D:
1069                 imageSize = imageInfo->height * imageInfo->rowPitch;
1070                 break;
1071             case CL_MEM_OBJECT_IMAGE3D:
1072                 imageSize = imageInfo->depth * imageInfo->slicePitch;
1073                 break;
1074             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1075                 imageSize = imageInfo->arraySize * imageInfo->slicePitch;
1076                 break;
1077             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1078                 imageSize = imageInfo->arraySize * imageInfo->slicePitch;
1079                 break;
1080             default:
1081                 log_error("ERROR: Cannot identify image type %x\n",
1082                           imageInfo->type);
1083                 abort();
1084         }
1085     }
1086     return imageSize;
1087 }
1088 
1089 // Calculate image size in megabytes (strictly, mebibytes). Result is rounded
1090 // up.
get_image_size_mb(image_descriptor const * imageInfo)1091 cl_ulong get_image_size_mb(image_descriptor const *imageInfo)
1092 {
1093     cl_ulong imageSize = get_image_size(imageInfo);
1094     cl_ulong mb = imageSize / (1024 * 1024);
1095     if (imageSize % (1024 * 1024) > 0)
1096     {
1097         mb += 1;
1098     }
1099     return mb;
1100 }
1101 
1102 
1103 uint64_t gRoundingStartValue = 0;
1104 
1105 
escape_inf_nan_values(char * data,size_t allocSize)1106 void escape_inf_nan_values(char *data, size_t allocSize)
1107 {
1108     // filter values with 8 not-quite-highest bits
1109     unsigned int *intPtr = (unsigned int *)data;
1110     for (size_t i = 0; i<allocSize>> 2; i++)
1111     {
1112         if ((intPtr[i] & 0x7F800000) == 0x7F800000) intPtr[i] ^= 0x40000000;
1113     }
1114 
1115     // Ditto with half floats (16-bit numbers with the 5 not-quite-highest bits
1116     // = 0x7C00 are special)
1117     unsigned short *shortPtr = (unsigned short *)data;
1118     for (size_t i = 0; i<allocSize>> 1; i++)
1119     {
1120         if ((shortPtr[i] & 0x7C00) == 0x7C00) shortPtr[i] ^= 0x4000;
1121     }
1122 }
1123 
generate_random_image_data(image_descriptor * imageInfo,BufferOwningPtr<char> & P,MTdata d)1124 char *generate_random_image_data(image_descriptor *imageInfo,
1125                                  BufferOwningPtr<char> &P, MTdata d)
1126 {
1127     size_t allocSize = get_image_size(imageInfo);
1128     size_t pixelRowBytes = imageInfo->width * get_pixel_size(imageInfo->format);
1129     size_t i;
1130 
1131     if (imageInfo->num_mip_levels > 1)
1132         allocSize = compute_mipmapped_image_size(*imageInfo);
1133 
1134 #if defined(__APPLE__)
1135     char *data = NULL;
1136     if (gDeviceType == CL_DEVICE_TYPE_CPU)
1137     {
1138         size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192;
1139 
1140         void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE,
1141                          MAP_ANON | MAP_PRIVATE, 0, 0);
1142         intptr_t data_end = (intptr_t)map + mapSize - 4096;
1143         data = (char *)(data_end - (intptr_t)allocSize);
1144 
1145         mprotect(map, 4096, PROT_NONE);
1146         mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
1147         P.reset(data, map, mapSize, allocSize);
1148     }
1149     else
1150     {
1151         data = (char *)malloc(allocSize);
1152         P.reset(data, NULL, 0, allocSize);
1153     }
1154 #else
1155     P.reset(NULL); // Free already allocated memory first, then try to allocate
1156                    // new block.
1157     char *data =
1158         (char *)align_malloc(allocSize, get_pixel_alignment(imageInfo->format));
1159     P.reset(data, NULL, 0, allocSize, true);
1160 #endif
1161 
1162     if (data == NULL)
1163     {
1164         log_error("ERROR: Unable to malloc %lu bytes for "
1165                   "generate_random_image_data\n",
1166                   allocSize);
1167         return 0;
1168     }
1169 
1170     if (gTestRounding)
1171     {
1172         // Special case: fill with a ramp from 0 to the size of the type
1173         size_t typeSize = get_format_type_size(imageInfo->format);
1174         switch (typeSize)
1175         {
1176             case 1: {
1177                 char *ptr = data;
1178                 for (i = 0; i < allocSize; i++)
1179                     ptr[i] = (cl_char)(i + gRoundingStartValue);
1180             }
1181             break;
1182             case 2: {
1183                 cl_short *ptr = (cl_short *)data;
1184                 for (i = 0; i < allocSize / 2; i++)
1185                     ptr[i] = (cl_short)(i + gRoundingStartValue);
1186             }
1187             break;
1188             case 4: {
1189                 cl_int *ptr = (cl_int *)data;
1190                 for (i = 0; i < allocSize / 4; i++)
1191                     ptr[i] = (cl_int)(i + gRoundingStartValue);
1192             }
1193             break;
1194         }
1195 
1196         // Note: inf or nan float values would cause problems, although we don't
1197         // know this will actually be a float, so we just know what to look for
1198         escape_inf_nan_values(data, allocSize);
1199         return data;
1200     }
1201 
1202     // Otherwise, we should be able to just fill with random bits no matter what
1203     cl_uint *p = (cl_uint *)data;
1204     for (i = 0; i + 4 <= allocSize; i += 4) p[i / 4] = genrand_int32(d);
1205 
1206     for (; i < allocSize; i++) data[i] = genrand_int32(d);
1207 
1208     // Note: inf or nan float values would cause problems, although we don't
1209     // know this will actually be a float, so we just know what to look for
1210     escape_inf_nan_values(data, allocSize);
1211 
1212     if (/*!gTestMipmaps*/ imageInfo->num_mip_levels < 2)
1213     {
1214         // Fill unused edges with -1, NaN for float
1215         if (imageInfo->rowPitch > pixelRowBytes)
1216         {
1217             size_t height = 0;
1218 
1219             switch (imageInfo->type)
1220             {
1221                 case CL_MEM_OBJECT_IMAGE2D:
1222                 case CL_MEM_OBJECT_IMAGE3D:
1223                 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1224                     height = imageInfo->height;
1225                     break;
1226                 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1227                     height = imageInfo->arraySize;
1228                     break;
1229             }
1230 
1231             // Fill in the row padding regions
1232             for (i = 0; i < height; i++)
1233             {
1234                 size_t offset = i * imageInfo->rowPitch + pixelRowBytes;
1235                 size_t length = imageInfo->rowPitch - pixelRowBytes;
1236                 memset(data + offset, 0xff, length);
1237             }
1238         }
1239 
1240         // Fill in the slice padding regions, if necessary:
1241 
1242         size_t slice_dimension = imageInfo->height;
1243         if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
1244         {
1245             slice_dimension = imageInfo->arraySize;
1246         }
1247 
1248         if (imageInfo->slicePitch > slice_dimension * imageInfo->rowPitch)
1249         {
1250             size_t depth = 0;
1251             switch (imageInfo->type)
1252             {
1253                 case CL_MEM_OBJECT_IMAGE2D:
1254                 case CL_MEM_OBJECT_IMAGE3D: depth = imageInfo->depth; break;
1255                 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1256                 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1257                     depth = imageInfo->arraySize;
1258                     break;
1259             }
1260 
1261             for (i = 0; i < depth; i++)
1262             {
1263                 size_t offset = i * imageInfo->slicePitch
1264                     + slice_dimension * imageInfo->rowPitch;
1265                 size_t length = imageInfo->slicePitch
1266                     - slice_dimension * imageInfo->rowPitch;
1267                 memset(data + offset, 0xff, length);
1268             }
1269         }
1270     }
1271 
1272     return data;
1273 }
1274 
1275 #define CLAMP_FLOAT(v) (fmaxf(fminf(v, 1.f), -1.f))
1276 
1277 
read_image_pixel_float(void * imageData,image_descriptor * imageInfo,int x,int y,int z,float * outData,int lod)1278 void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x,
1279                             int y, int z, float *outData, int lod)
1280 {
1281     size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
1282            depth_lod = imageInfo->depth;
1283     size_t slice_pitch_lod = 0, row_pitch_lod = 0;
1284 
1285     if (imageInfo->num_mip_levels > 1)
1286     {
1287         switch (imageInfo->type)
1288         {
1289             case CL_MEM_OBJECT_IMAGE3D:
1290                 depth_lod =
1291                     (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
1292             case CL_MEM_OBJECT_IMAGE2D:
1293             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1294                 height_lod =
1295                     (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
1296             default:
1297                 width_lod =
1298                     (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
1299         }
1300         row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
1301         if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
1302             slice_pitch_lod = row_pitch_lod;
1303         else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D
1304                  || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
1305             slice_pitch_lod = row_pitch_lod * height_lod;
1306     }
1307     else
1308     {
1309         row_pitch_lod = imageInfo->rowPitch;
1310         slice_pitch_lod = imageInfo->slicePitch;
1311     }
1312     if (x < 0 || y < 0 || z < 0 || x >= (int)width_lod
1313         || (height_lod != 0 && y >= (int)height_lod)
1314         || (depth_lod != 0 && z >= (int)depth_lod)
1315         || (imageInfo->arraySize != 0 && z >= (int)imageInfo->arraySize))
1316     {
1317         outData[0] = outData[1] = outData[2] = outData[3] = 0;
1318         if (!has_alpha(imageInfo->format)) outData[3] = 1;
1319         return;
1320     }
1321 
1322     const cl_image_format *format = imageInfo->format;
1323 
1324     unsigned int i;
1325     float tempData[4];
1326 
1327     // Advance to the right spot
1328     char *ptr = (char *)imageData;
1329     size_t pixelSize = get_pixel_size(format);
1330 
1331     ptr += z * slice_pitch_lod + y * row_pitch_lod + x * pixelSize;
1332 
1333     // OpenCL only supports reading floats from certain formats
1334     size_t channelCount = get_format_channel_count(format);
1335     switch (format->image_channel_data_type)
1336     {
1337         case CL_SNORM_INT8: {
1338             cl_char *dPtr = (cl_char *)ptr;
1339             for (i = 0; i < channelCount; i++)
1340                 tempData[i] = CLAMP_FLOAT((float)dPtr[i] / 127.0f);
1341             break;
1342         }
1343 
1344         case CL_UNORM_INT8: {
1345             unsigned char *dPtr = (unsigned char *)ptr;
1346             for (i = 0; i < channelCount; i++)
1347             {
1348                 if ((is_sRGBA_order(imageInfo->format->image_channel_order))
1349                     && i < 3) // only RGB need to be converted for sRGBA
1350                     tempData[i] = (float)sRGBunmap((float)dPtr[i] / 255.0f);
1351                 else
1352                     tempData[i] = (float)dPtr[i] / 255.0f;
1353             }
1354             break;
1355         }
1356 
1357         case CL_SIGNED_INT8: {
1358             cl_char *dPtr = (cl_char *)ptr;
1359             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1360             break;
1361         }
1362 
1363         case CL_UNSIGNED_INT8: {
1364             cl_uchar *dPtr = (cl_uchar *)ptr;
1365             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1366             break;
1367         }
1368 
1369         case CL_SNORM_INT16: {
1370             cl_short *dPtr = (cl_short *)ptr;
1371             for (i = 0; i < channelCount; i++)
1372                 tempData[i] = CLAMP_FLOAT((float)dPtr[i] / 32767.0f);
1373             break;
1374         }
1375 
1376         case CL_UNORM_INT16: {
1377             cl_ushort *dPtr = (cl_ushort *)ptr;
1378             for (i = 0; i < channelCount; i++)
1379                 tempData[i] = (float)dPtr[i] / 65535.0f;
1380             break;
1381         }
1382 
1383         case CL_SIGNED_INT16: {
1384             cl_short *dPtr = (cl_short *)ptr;
1385             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1386             break;
1387         }
1388 
1389         case CL_UNSIGNED_INT16: {
1390             cl_ushort *dPtr = (cl_ushort *)ptr;
1391             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1392             break;
1393         }
1394 
1395         case CL_HALF_FLOAT: {
1396             cl_half *dPtr = (cl_half *)ptr;
1397             for (i = 0; i < channelCount; i++)
1398                 tempData[i] = cl_half_to_float(dPtr[i]);
1399             break;
1400         }
1401 
1402         case CL_SIGNED_INT32: {
1403             cl_int *dPtr = (cl_int *)ptr;
1404             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1405             break;
1406         }
1407 
1408         case CL_UNSIGNED_INT32: {
1409             cl_uint *dPtr = (cl_uint *)ptr;
1410             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1411             break;
1412         }
1413 
1414         case CL_UNORM_SHORT_565: {
1415             cl_ushort *dPtr = (cl_ushort *)ptr;
1416             tempData[0] = (float)(dPtr[0] >> 11) / (float)31;
1417             tempData[1] = (float)((dPtr[0] >> 5) & 63) / (float)63;
1418             tempData[2] = (float)(dPtr[0] & 31) / (float)31;
1419             break;
1420         }
1421 
1422         case CL_UNORM_SHORT_555: {
1423             cl_ushort *dPtr = (cl_ushort *)ptr;
1424             tempData[0] = (float)((dPtr[0] >> 10) & 31) / (float)31;
1425             tempData[1] = (float)((dPtr[0] >> 5) & 31) / (float)31;
1426             tempData[2] = (float)(dPtr[0] & 31) / (float)31;
1427             break;
1428         }
1429 
1430         case CL_UNORM_INT_101010: {
1431             cl_uint *dPtr = (cl_uint *)ptr;
1432             tempData[0] = (float)((dPtr[0] >> 20) & 0x3ff) / (float)1023;
1433             tempData[1] = (float)((dPtr[0] >> 10) & 0x3ff) / (float)1023;
1434             tempData[2] = (float)(dPtr[0] & 0x3ff) / (float)1023;
1435             break;
1436         }
1437 
1438         case CL_FLOAT: {
1439             float *dPtr = (float *)ptr;
1440             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1441             break;
1442         }
1443 #ifdef CL_SFIXED14_APPLE
1444         case CL_SFIXED14_APPLE: {
1445             cl_ushort *dPtr = (cl_ushort *)ptr;
1446             for (i = 0; i < channelCount; i++)
1447                 tempData[i] = ((int)dPtr[i] - 16384) * 0x1.0p-14f;
1448             break;
1449         }
1450 #endif
1451     }
1452 
1453 
1454     outData[0] = outData[1] = outData[2] = 0;
1455     outData[3] = 1;
1456 
1457     switch (format->image_channel_order)
1458     {
1459         case CL_A: outData[3] = tempData[0]; break;
1460         case CL_R:
1461         case CL_Rx: outData[0] = tempData[0]; break;
1462         case CL_RA:
1463             outData[0] = tempData[0];
1464             outData[3] = tempData[1];
1465             break;
1466         case CL_RG:
1467         case CL_RGx:
1468             outData[0] = tempData[0];
1469             outData[1] = tempData[1];
1470             break;
1471         case CL_RGB:
1472         case CL_RGBx:
1473         case CL_sRGB:
1474         case CL_sRGBx:
1475             outData[0] = tempData[0];
1476             outData[1] = tempData[1];
1477             outData[2] = tempData[2];
1478             break;
1479         case CL_RGBA:
1480             outData[0] = tempData[0];
1481             outData[1] = tempData[1];
1482             outData[2] = tempData[2];
1483             outData[3] = tempData[3];
1484             break;
1485         case CL_ARGB:
1486             outData[0] = tempData[1];
1487             outData[1] = tempData[2];
1488             outData[2] = tempData[3];
1489             outData[3] = tempData[0];
1490             break;
1491         case CL_ABGR:
1492             outData[0] = tempData[3];
1493             outData[1] = tempData[2];
1494             outData[2] = tempData[1];
1495             outData[3] = tempData[0];
1496             break;
1497         case CL_BGRA:
1498         case CL_sBGRA:
1499             outData[0] = tempData[2];
1500             outData[1] = tempData[1];
1501             outData[2] = tempData[0];
1502             outData[3] = tempData[3];
1503             break;
1504         case CL_INTENSITY:
1505             outData[0] = tempData[0];
1506             outData[1] = tempData[0];
1507             outData[2] = tempData[0];
1508             outData[3] = tempData[0];
1509             break;
1510         case CL_LUMINANCE:
1511             outData[0] = tempData[0];
1512             outData[1] = tempData[0];
1513             outData[2] = tempData[0];
1514             break;
1515 #ifdef CL_1RGB_APPLE
1516         case CL_1RGB_APPLE:
1517             outData[0] = tempData[1];
1518             outData[1] = tempData[2];
1519             outData[2] = tempData[3];
1520             outData[3] = 1.0f;
1521             break;
1522 #endif
1523 #ifdef CL_BGR1_APPLE
1524         case CL_BGR1_APPLE:
1525             outData[0] = tempData[2];
1526             outData[1] = tempData[1];
1527             outData[2] = tempData[0];
1528             outData[3] = 1.0f;
1529             break;
1530 #endif
1531         case CL_sRGBA:
1532             outData[0] = tempData[0];
1533             outData[1] = tempData[1];
1534             outData[2] = tempData[2];
1535             outData[3] = tempData[3];
1536             break;
1537         case CL_DEPTH: outData[0] = tempData[0]; break;
1538         default:
1539             log_error("Invalid format:");
1540             print_header(format, true);
1541             break;
1542     }
1543 }
1544 
read_image_pixel_float(void * imageData,image_descriptor * imageInfo,int x,int y,int z,float * outData)1545 void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x,
1546                             int y, int z, float *outData)
1547 {
1548     read_image_pixel_float(imageData, imageInfo, x, y, z, outData, 0);
1549 }
1550 
get_integer_coords(float x,float y,float z,size_t width,size_t height,size_t depth,image_sampler_data * imageSampler,image_descriptor * imageInfo,int & outX,int & outY,int & outZ)1551 bool get_integer_coords(float x, float y, float z, size_t width, size_t height,
1552                         size_t depth, image_sampler_data *imageSampler,
1553                         image_descriptor *imageInfo, int &outX, int &outY,
1554                         int &outZ)
1555 {
1556     return get_integer_coords_offset(x, y, z, 0.0f, 0.0f, 0.0f, width, height,
1557                                      depth, imageSampler, imageInfo, outX, outY,
1558                                      outZ);
1559 }
1560 
get_integer_coords_offset(float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,size_t width,size_t height,size_t depth,image_sampler_data * imageSampler,image_descriptor * imageInfo,int & outX,int & outY,int & outZ)1561 bool get_integer_coords_offset(float x, float y, float z, float xAddressOffset,
1562                                float yAddressOffset, float zAddressOffset,
1563                                size_t width, size_t height, size_t depth,
1564                                image_sampler_data *imageSampler,
1565                                image_descriptor *imageInfo, int &outX,
1566                                int &outY, int &outZ)
1567 {
1568     AddressFn adFn = sAddressingTable[imageSampler];
1569 
1570     float refX = floorf(x), refY = floorf(y), refZ = floorf(z);
1571 
1572     // Handle sampler-directed coordinate normalization + clamping.  Note that
1573     // the array coordinate for image array types is expected to be
1574     // unnormalized, and is clamped to 0..arraySize-1.
1575     if (imageSampler->normalized_coords)
1576     {
1577         switch (imageSampler->addressing_mode)
1578         {
1579             case CL_ADDRESS_REPEAT:
1580                 x = RepeatNormalizedAddressFn(x, width);
1581                 if (height != 0)
1582                 {
1583                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1584                         y = RepeatNormalizedAddressFn(y, height);
1585                 }
1586                 if (depth != 0)
1587                 {
1588                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1589                         z = RepeatNormalizedAddressFn(z, depth);
1590                 }
1591 
1592                 if (xAddressOffset != 0.0)
1593                 {
1594                     // Add in the offset
1595                     x += xAddressOffset;
1596                     // Handle wrapping
1597                     if (x > width) x -= (float)width;
1598                     if (x < 0) x += (float)width;
1599                 }
1600                 if ((yAddressOffset != 0.0)
1601                     && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY))
1602                 {
1603                     // Add in the offset
1604                     y += yAddressOffset;
1605                     // Handle wrapping
1606                     if (y > height) y -= (float)height;
1607                     if (y < 0) y += (float)height;
1608                 }
1609                 if ((zAddressOffset != 0.0)
1610                     && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY))
1611                 {
1612                     // Add in the offset
1613                     z += zAddressOffset;
1614                     // Handle wrapping
1615                     if (z > depth) z -= (float)depth;
1616                     if (z < 0) z += (float)depth;
1617                 }
1618                 break;
1619 
1620             case CL_ADDRESS_MIRRORED_REPEAT:
1621                 x = MirroredRepeatNormalizedAddressFn(x, width);
1622                 if (height != 0)
1623                 {
1624                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1625                         y = MirroredRepeatNormalizedAddressFn(y, height);
1626                 }
1627                 if (depth != 0)
1628                 {
1629                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1630                         z = MirroredRepeatNormalizedAddressFn(z, depth);
1631                 }
1632 
1633                 if (xAddressOffset != 0.0)
1634                 {
1635                     float temp = x + xAddressOffset;
1636                     if (temp > (float)width)
1637                         temp = (float)width - (temp - (float)width);
1638                     x = fabsf(temp);
1639                 }
1640                 if ((yAddressOffset != 0.0)
1641                     && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY))
1642                 {
1643                     float temp = y + yAddressOffset;
1644                     if (temp > (float)height)
1645                         temp = (float)height - (temp - (float)height);
1646                     y = fabsf(temp);
1647                 }
1648                 if ((zAddressOffset != 0.0)
1649                     && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY))
1650                 {
1651                     float temp = z + zAddressOffset;
1652                     if (temp > (float)depth)
1653                         temp = (float)depth - (temp - (float)depth);
1654                     z = fabsf(temp);
1655                 }
1656                 break;
1657 
1658             default:
1659                 // Also, remultiply to the original coords. This simulates any
1660                 // truncation in the pass to OpenCL
1661                 x *= (float)width;
1662                 x += xAddressOffset;
1663 
1664                 if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1665                 {
1666                     y *= (float)height;
1667                     y += yAddressOffset;
1668                 }
1669 
1670                 if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1671                 {
1672                     z *= (float)depth;
1673                     z += zAddressOffset;
1674                 }
1675                 break;
1676         }
1677     }
1678 
1679     // At this point, we're dealing with non-normalized coordinates.
1680 
1681     outX = adFn(floorf(x), width);
1682 
1683     // 1D and 2D arrays require special care for the index coordinate:
1684 
1685     switch (imageInfo->type)
1686     {
1687         case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1688             outY = calculate_array_index(y, (float)imageInfo->arraySize - 1.0f);
1689             outZ = 0.0f; /* don't care! */
1690             break;
1691         case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1692             outY = adFn(floorf(y), height);
1693             outZ = calculate_array_index(z, (float)imageInfo->arraySize - 1.0f);
1694             break;
1695         default:
1696             // legacy path:
1697             if (height != 0) outY = adFn(floorf(y), height);
1698             if (depth != 0) outZ = adFn(floorf(z), depth);
1699     }
1700 
1701     return !((int)refX == outX && (int)refY == outY && (int)refZ == outZ);
1702 }
1703 
frac(float a)1704 static float frac(float a) { return a - floorf(a); }
1705 
1706 static inline void pixelMax(const float a[4], const float b[4], float *results);
pixelMax(const float a[4],const float b[4],float * results)1707 static inline void pixelMax(const float a[4], const float b[4], float *results)
1708 {
1709     for (int i = 0; i < 4; i++) results[i] = errMax(fabsf(a[i]), fabsf(b[i]));
1710 }
1711 
1712 // If containsDenorms is NULL, flush denorms to zero
1713 // if containsDenorms is not NULL, record whether there are any denorms
1714 static inline void check_for_denorms(float a[4], int *containsDenorms);
check_for_denorms(float a[4],int * containsDenorms)1715 static inline void check_for_denorms(float a[4], int *containsDenorms)
1716 {
1717     if (NULL == containsDenorms)
1718     {
1719         for (int i = 0; i < 4; i++)
1720         {
1721             if (IsFloatSubnormal(a[i])) a[i] = copysignf(0.0f, a[i]);
1722         }
1723     }
1724     else
1725     {
1726         for (int i = 0; i < 4; i++)
1727         {
1728             if (IsFloatSubnormal(a[i]))
1729             {
1730                 *containsDenorms = 1;
1731                 break;
1732             }
1733         }
1734     }
1735 }
1736 
calculate_array_index(float coord,float extent)1737 inline float calculate_array_index(float coord, float extent)
1738 {
1739     // from Section 8.4 of the 1.2 Spec 'Selecting an Image from an Image Array'
1740     //
1741     // given coordinate 'w' that represents an index:
1742     // layer_index = clamp( rint(w), 0, image_array_size - 1)
1743 
1744     float ret = rintf(coord);
1745     ret = ret > extent ? extent : ret;
1746     ret = ret < 0.0f ? 0.0f : ret;
1747 
1748     return ret;
1749 }
1750 
1751 /*
1752  * Utility function to unnormalized a coordinate given a particular sampler.
1753  *
1754  * name     - the name of the coordinate, used for verbose debugging only
1755  * coord    - the coordinate requiring unnormalization
1756  * offset   - an addressing offset to be added to the coordinate
1757  * extent   - the max value for this coordinate (e.g. width for x)
1758  */
unnormalize_coordinate(const char * name,float coord,float offset,float extent,cl_addressing_mode addressing_mode,int verbose)1759 static float unnormalize_coordinate(const char *name, float coord, float offset,
1760                                     float extent,
1761                                     cl_addressing_mode addressing_mode,
1762                                     int verbose)
1763 {
1764     float ret = 0.0f;
1765 
1766     switch (addressing_mode)
1767     {
1768         case CL_ADDRESS_REPEAT:
1769             ret = RepeatNormalizedAddressFn(coord, extent);
1770 
1771             if (verbose)
1772             {
1773                 log_info("\tRepeat filter denormalizes %s (%f) to %f\n", name,
1774                          coord, ret);
1775             }
1776 
1777             if (offset != 0.0)
1778             {
1779                 // Add in the offset, and handle wrapping.
1780                 ret += offset;
1781                 if (ret > extent) ret -= extent;
1782                 if (ret < 0.0) ret += extent;
1783             }
1784 
1785             if (verbose && offset != 0.0f)
1786             {
1787                 log_info("\tAddress offset of %f added to get %f\n", offset,
1788                          ret);
1789             }
1790             break;
1791 
1792         case CL_ADDRESS_MIRRORED_REPEAT:
1793             ret = MirroredRepeatNormalizedAddressFn(coord, extent);
1794 
1795             if (verbose)
1796             {
1797                 log_info(
1798                     "\tMirrored repeat filter denormalizes %s (%f) to %f\n",
1799                     name, coord, ret);
1800             }
1801 
1802             if (offset != 0.0)
1803             {
1804                 float temp = ret + offset;
1805                 if (temp > extent) temp = extent - (temp - extent);
1806                 ret = fabsf(temp);
1807             }
1808 
1809             if (verbose && offset != 0.0f)
1810             {
1811                 log_info("\tAddress offset of %f added to get %f\n", offset,
1812                          ret);
1813             }
1814             break;
1815 
1816         default:
1817 
1818             ret = coord * extent;
1819 
1820             if (verbose)
1821             {
1822                 log_info("\tFilter denormalizes %s to %f (%f * %f)\n", name,
1823                          ret, coord, extent);
1824             }
1825 
1826             ret += offset;
1827 
1828             if (verbose && offset != 0.0f)
1829             {
1830                 log_info("\tAddress offset of %f added to get %f\n", offset,
1831                          ret);
1832             }
1833     }
1834 
1835     return ret;
1836 }
1837 
1838 FloatPixel
sample_image_pixel_float(void * imageData,image_descriptor * imageInfo,float x,float y,float z,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms)1839 sample_image_pixel_float(void *imageData, image_descriptor *imageInfo, float x,
1840                          float y, float z, image_sampler_data *imageSampler,
1841                          float *outData, int verbose, int *containsDenorms)
1842 {
1843     return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f,
1844                                            0.0f, 0.0f, imageSampler, outData,
1845                                            verbose, containsDenorms);
1846 }
1847 
1848 // returns max pixel value of the pixels touched
sample_image_pixel_float(void * imageData,image_descriptor * imageInfo,float x,float y,float z,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms,int lod)1849 FloatPixel sample_image_pixel_float(void *imageData,
1850                                     image_descriptor *imageInfo, float x,
1851                                     float y, float z,
1852                                     image_sampler_data *imageSampler,
1853                                     float *outData, int verbose,
1854                                     int *containsDenorms, int lod)
1855 {
1856     return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f,
1857                                            0.0f, 0.0f, imageSampler, outData,
1858                                            verbose, containsDenorms, lod);
1859 }
sample_image_pixel_float_offset(void * imageData,image_descriptor * imageInfo,float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms,int lod)1860 FloatPixel sample_image_pixel_float_offset(
1861     void *imageData, image_descriptor *imageInfo, float x, float y, float z,
1862     float xAddressOffset, float yAddressOffset, float zAddressOffset,
1863     image_sampler_data *imageSampler, float *outData, int verbose,
1864     int *containsDenorms, int lod)
1865 {
1866     AddressFn adFn = sAddressingTable[imageSampler];
1867     FloatPixel returnVal;
1868     size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
1869            depth_lod = imageInfo->depth;
1870     size_t slice_pitch_lod = 0, row_pitch_lod = 0;
1871 
1872     if (imageInfo->num_mip_levels > 1)
1873     {
1874         switch (imageInfo->type)
1875         {
1876             case CL_MEM_OBJECT_IMAGE3D:
1877                 depth_lod =
1878                     (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
1879             case CL_MEM_OBJECT_IMAGE2D:
1880             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1881                 height_lod =
1882                     (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
1883             default:
1884                 width_lod =
1885                     (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
1886         }
1887         row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
1888         if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
1889             slice_pitch_lod = row_pitch_lod;
1890         else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D
1891                  || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
1892             slice_pitch_lod = row_pitch_lod * height_lod;
1893     }
1894     else
1895     {
1896         slice_pitch_lod = imageInfo->slicePitch;
1897         row_pitch_lod = imageInfo->rowPitch;
1898     }
1899 
1900     if (containsDenorms) *containsDenorms = 0;
1901 
1902     if (imageSampler->normalized_coords)
1903     {
1904 
1905         // We need to unnormalize our coordinates differently depending on
1906         // the image type, but 'x' is always processed the same way.
1907 
1908         x = unnormalize_coordinate("x", x, xAddressOffset, (float)width_lod,
1909                                    imageSampler->addressing_mode, verbose);
1910 
1911         switch (imageInfo->type)
1912         {
1913 
1914                 // The image array types require special care:
1915 
1916             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1917                 z = 0; // don't care -- unused for 1D arrays
1918                 break;
1919 
1920             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1921                 y = unnormalize_coordinate(
1922                     "y", y, yAddressOffset, (float)height_lod,
1923                     imageSampler->addressing_mode, verbose);
1924                 break;
1925 
1926                 // Everybody else:
1927 
1928             default:
1929                 y = unnormalize_coordinate(
1930                     "y", y, yAddressOffset, (float)height_lod,
1931                     imageSampler->addressing_mode, verbose);
1932                 z = unnormalize_coordinate(
1933                     "z", z, zAddressOffset, (float)depth_lod,
1934                     imageSampler->addressing_mode, verbose);
1935         }
1936     }
1937     else if (verbose)
1938     {
1939 
1940         switch (imageInfo->type)
1941         {
1942             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1943                 log_info("Starting coordinate: %f, array index %f\n", x, y);
1944                 break;
1945             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1946                 log_info("Starting coordinate: %f, %f, array index %f\n", x, y,
1947                          z);
1948                 break;
1949             case CL_MEM_OBJECT_IMAGE1D:
1950             case CL_MEM_OBJECT_IMAGE1D_BUFFER:
1951                 log_info("Starting coordinate: %f\b", x);
1952                 break;
1953             case CL_MEM_OBJECT_IMAGE2D:
1954                 log_info("Starting coordinate: %f, %f\n", x, y);
1955                 break;
1956             case CL_MEM_OBJECT_IMAGE3D:
1957             default: log_info("Starting coordinate: %f, %f, %f\n", x, y, z);
1958         }
1959     }
1960 
1961     // At this point, we have unnormalized coordinates.
1962 
1963     if (imageSampler->filter_mode == CL_FILTER_NEAREST)
1964     {
1965         int ix, iy, iz;
1966 
1967         // We apply the addressing function to the now-unnormalized
1968         // coordinates.  Note that the array cases again require special
1969         // care, per section 8.4 in the OpenCL 1.2 Specification.
1970 
1971         ix = adFn(floorf(x), width_lod);
1972 
1973         switch (imageInfo->type)
1974         {
1975             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1976                 iy =
1977                     calculate_array_index(y, (float)(imageInfo->arraySize - 1));
1978                 iz = 0;
1979                 if (verbose)
1980                 {
1981                     log_info("\tArray index %f evaluates to %d\n", y, iy);
1982                 }
1983                 break;
1984             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1985                 iy = adFn(floorf(y), height_lod);
1986                 iz =
1987                     calculate_array_index(z, (float)(imageInfo->arraySize - 1));
1988                 if (verbose)
1989                 {
1990                     log_info("\tArray index %f evaluates to %d\n", z, iz);
1991                 }
1992                 break;
1993             default:
1994                 iy = adFn(floorf(y), height_lod);
1995                 if (depth_lod != 0)
1996                     iz = adFn(floorf(z), depth_lod);
1997                 else
1998                     iz = 0;
1999         }
2000 
2001         if (verbose)
2002         {
2003             if (iz)
2004                 log_info(
2005                     "\tReference integer coords calculated: { %d, %d, %d }\n",
2006                     ix, iy, iz);
2007             else
2008                 log_info("\tReference integer coords calculated: { %d, %d }\n",
2009                          ix, iy);
2010         }
2011 
2012         read_image_pixel_float(imageData, imageInfo, ix, iy, iz, outData, lod);
2013         check_for_denorms(outData, containsDenorms);
2014         for (int i = 0; i < 4; i++) returnVal.p[i] = fabsf(outData[i]);
2015         return returnVal;
2016     }
2017     else
2018     {
2019         // Linear filtering cases.
2020 
2021         size_t width = width_lod, height = height_lod, depth = depth_lod;
2022 
2023         // Image arrays can use 2D filtering, but require us to walk into the
2024         // image a certain number of slices before reading.
2025 
2026         if (depth == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY
2027             || imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
2028         {
2029             float array_index = 0;
2030 
2031             size_t layer_offset = 0;
2032 
2033             if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
2034             {
2035                 array_index =
2036                     calculate_array_index(z, (float)(imageInfo->arraySize - 1));
2037                 layer_offset = slice_pitch_lod * (size_t)array_index;
2038             }
2039             else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
2040             {
2041                 array_index =
2042                     calculate_array_index(y, (float)(imageInfo->arraySize - 1));
2043                 layer_offset = slice_pitch_lod * (size_t)array_index;
2044 
2045                 // Set up y and height so that the filtering below is correct
2046                 // 1D filtering on a single slice.
2047                 height = 1;
2048             }
2049 
2050             int x1 = adFn(floorf(x - 0.5f), width);
2051             int y1 = 0;
2052             int x2 = adFn(floorf(x - 0.5f) + 1, width);
2053             int y2 = 0;
2054             if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D)
2055                 && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
2056                 && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER))
2057             {
2058                 y1 = adFn(floorf(y - 0.5f), height);
2059                 y2 = adFn(floorf(y - 0.5f) + 1, height);
2060             }
2061             else
2062             {
2063                 y = 0.5f;
2064             }
2065 
2066             if (verbose)
2067             {
2068                 log_info("\tActual integer coords used (i = floor(x-.5)): i0:{ "
2069                          "%d, %d } and i1:{ %d, %d }\n",
2070                          x1, y1, x2, y2);
2071                 log_info("\tArray coordinate is %f\n", array_index);
2072             }
2073 
2074             // Walk to beginning of the 'correct' slice, if needed.
2075             char *imgPtr = ((char *)imageData) + layer_offset;
2076 
2077             float upLeft[4], upRight[4], lowLeft[4], lowRight[4];
2078             float maxUp[4], maxLow[4];
2079             read_image_pixel_float(imgPtr, imageInfo, x1, y1, 0, upLeft, lod);
2080             read_image_pixel_float(imgPtr, imageInfo, x2, y1, 0, upRight, lod);
2081             check_for_denorms(upLeft, containsDenorms);
2082             check_for_denorms(upRight, containsDenorms);
2083             pixelMax(upLeft, upRight, maxUp);
2084             read_image_pixel_float(imgPtr, imageInfo, x1, y2, 0, lowLeft, lod);
2085             read_image_pixel_float(imgPtr, imageInfo, x2, y2, 0, lowRight, lod);
2086             check_for_denorms(lowLeft, containsDenorms);
2087             check_for_denorms(lowRight, containsDenorms);
2088             pixelMax(lowLeft, lowRight, maxLow);
2089             pixelMax(maxUp, maxLow, returnVal.p);
2090 
2091             if (verbose)
2092             {
2093                 if (NULL == containsDenorms)
2094                     log_info("\tSampled pixels (rgba order, denorms flushed to "
2095                              "zero):\n");
2096                 else
2097                     log_info("\tSampled pixels (rgba order):\n");
2098                 log_info("\t\tp00: %f, %f, %f, %f\n", upLeft[0], upLeft[1],
2099                          upLeft[2], upLeft[3]);
2100                 log_info("\t\tp01: %f, %f, %f, %f\n", upRight[0], upRight[1],
2101                          upRight[2], upRight[3]);
2102                 log_info("\t\tp10: %f, %f, %f, %f\n", lowLeft[0], lowLeft[1],
2103                          lowLeft[2], lowLeft[3]);
2104                 log_info("\t\tp11: %f, %f, %f, %f\n", lowRight[0], lowRight[1],
2105                          lowRight[2], lowRight[3]);
2106             }
2107 
2108             bool printMe = false;
2109             if (x1 <= 0 || x2 <= 0 || x1 >= (int)width - 1
2110                 || x2 >= (int)width - 1)
2111                 printMe = true;
2112             if (y1 <= 0 || y2 <= 0 || y1 >= (int)height - 1
2113                 || y2 >= (int)height - 1)
2114                 printMe = true;
2115 
2116             double weights[2][2];
2117 
2118             weights[0][0] = weights[0][1] = 1.0 - frac(x - 0.5f);
2119             weights[1][0] = weights[1][1] = frac(x - 0.5f);
2120             weights[0][0] *= 1.0 - frac(y - 0.5f);
2121             weights[1][0] *= 1.0 - frac(y - 0.5f);
2122             weights[0][1] *= frac(y - 0.5f);
2123             weights[1][1] *= frac(y - 0.5f);
2124 
2125             if (verbose)
2126                 log_info("\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f\n",
2127                          frac(x - 0.5f), frac(y - 0.5f));
2128 
2129             for (int i = 0; i < 3; i++)
2130             {
2131                 outData[i] = (float)((upLeft[i] * weights[0][0])
2132                                      + (upRight[i] * weights[1][0])
2133                                      + (lowLeft[i] * weights[0][1])
2134                                      + (lowRight[i] * weights[1][1]));
2135                 // flush subnormal results to zero if necessary
2136                 if (NULL == containsDenorms && fabs(outData[i]) < FLT_MIN)
2137                     outData[i] = copysignf(0.0f, outData[i]);
2138             }
2139             outData[3] = (float)((upLeft[3] * weights[0][0])
2140                                  + (upRight[3] * weights[1][0])
2141                                  + (lowLeft[3] * weights[0][1])
2142                                  + (lowRight[3] * weights[1][1]));
2143             // flush subnormal results to zero if necessary
2144             if (NULL == containsDenorms && fabs(outData[3]) < FLT_MIN)
2145                 outData[3] = copysignf(0.0f, outData[3]);
2146         }
2147         else
2148         {
2149             // 3D linear filtering
2150             int x1 = adFn(floorf(x - 0.5f), width_lod);
2151             int y1 = adFn(floorf(y - 0.5f), height_lod);
2152             int z1 = adFn(floorf(z - 0.5f), depth_lod);
2153             int x2 = adFn(floorf(x - 0.5f) + 1, width_lod);
2154             int y2 = adFn(floorf(y - 0.5f) + 1, height_lod);
2155             int z2 = adFn(floorf(z - 0.5f) + 1, depth_lod);
2156 
2157             if (verbose)
2158                 log_info("\tActual integer coords used (i = floor(x-.5)): "
2159                          "i0:{%d, %d, %d} and i1:{%d, %d, %d}\n",
2160                          x1, y1, z1, x2, y2, z2);
2161 
2162             float upLeftA[4], upRightA[4], lowLeftA[4], lowRightA[4];
2163             float upLeftB[4], upRightB[4], lowLeftB[4], lowRightB[4];
2164             float pixelMaxA[4], pixelMaxB[4];
2165             read_image_pixel_float(imageData, imageInfo, x1, y1, z1, upLeftA,
2166                                    lod);
2167             read_image_pixel_float(imageData, imageInfo, x2, y1, z1, upRightA,
2168                                    lod);
2169             check_for_denorms(upLeftA, containsDenorms);
2170             check_for_denorms(upRightA, containsDenorms);
2171             pixelMax(upLeftA, upRightA, pixelMaxA);
2172             read_image_pixel_float(imageData, imageInfo, x1, y2, z1, lowLeftA,
2173                                    lod);
2174             read_image_pixel_float(imageData, imageInfo, x2, y2, z1, lowRightA,
2175                                    lod);
2176             check_for_denorms(lowLeftA, containsDenorms);
2177             check_for_denorms(lowRightA, containsDenorms);
2178             pixelMax(lowLeftA, lowRightA, pixelMaxB);
2179             pixelMax(pixelMaxA, pixelMaxB, returnVal.p);
2180             read_image_pixel_float(imageData, imageInfo, x1, y1, z2, upLeftB,
2181                                    lod);
2182             read_image_pixel_float(imageData, imageInfo, x2, y1, z2, upRightB,
2183                                    lod);
2184             check_for_denorms(upLeftB, containsDenorms);
2185             check_for_denorms(upRightB, containsDenorms);
2186             pixelMax(upLeftB, upRightB, pixelMaxA);
2187             read_image_pixel_float(imageData, imageInfo, x1, y2, z2, lowLeftB,
2188                                    lod);
2189             read_image_pixel_float(imageData, imageInfo, x2, y2, z2, lowRightB,
2190                                    lod);
2191             check_for_denorms(lowLeftB, containsDenorms);
2192             check_for_denorms(lowRightB, containsDenorms);
2193             pixelMax(lowLeftB, lowRightB, pixelMaxB);
2194             pixelMax(pixelMaxA, pixelMaxB, pixelMaxA);
2195             pixelMax(pixelMaxA, returnVal.p, returnVal.p);
2196 
2197             if (verbose)
2198             {
2199                 if (NULL == containsDenorms)
2200                     log_info("\tSampled pixels (rgba order, denorms flushed to "
2201                              "zero):\n");
2202                 else
2203                     log_info("\tSampled pixels (rgba order):\n");
2204                 log_info("\t\tp000: %f, %f, %f, %f\n", upLeftA[0], upLeftA[1],
2205                          upLeftA[2], upLeftA[3]);
2206                 log_info("\t\tp001: %f, %f, %f, %f\n", upRightA[0], upRightA[1],
2207                          upRightA[2], upRightA[3]);
2208                 log_info("\t\tp010: %f, %f, %f, %f\n", lowLeftA[0], lowLeftA[1],
2209                          lowLeftA[2], lowLeftA[3]);
2210                 log_info("\t\tp011: %f, %f, %f, %f\n\n", lowRightA[0],
2211                          lowRightA[1], lowRightA[2], lowRightA[3]);
2212                 log_info("\t\tp100: %f, %f, %f, %f\n", upLeftB[0], upLeftB[1],
2213                          upLeftB[2], upLeftB[3]);
2214                 log_info("\t\tp101: %f, %f, %f, %f\n", upRightB[0], upRightB[1],
2215                          upRightB[2], upRightB[3]);
2216                 log_info("\t\tp110: %f, %f, %f, %f\n", lowLeftB[0], lowLeftB[1],
2217                          lowLeftB[2], lowLeftB[3]);
2218                 log_info("\t\tp111: %f, %f, %f, %f\n", lowRightB[0],
2219                          lowRightB[1], lowRightB[2], lowRightB[3]);
2220             }
2221 
2222             double weights[2][2][2];
2223 
2224             float a = frac(x - 0.5f), b = frac(y - 0.5f), c = frac(z - 0.5f);
2225             weights[0][0][0] = weights[0][1][0] = weights[0][0][1] =
2226                 weights[0][1][1] = 1.f - a;
2227             weights[1][0][0] = weights[1][1][0] = weights[1][0][1] =
2228                 weights[1][1][1] = a;
2229             weights[0][0][0] *= 1.f - b;
2230             weights[1][0][0] *= 1.f - b;
2231             weights[0][0][1] *= 1.f - b;
2232             weights[1][0][1] *= 1.f - b;
2233             weights[0][1][0] *= b;
2234             weights[1][1][0] *= b;
2235             weights[0][1][1] *= b;
2236             weights[1][1][1] *= b;
2237             weights[0][0][0] *= 1.f - c;
2238             weights[0][1][0] *= 1.f - c;
2239             weights[1][0][0] *= 1.f - c;
2240             weights[1][1][0] *= 1.f - c;
2241             weights[0][0][1] *= c;
2242             weights[0][1][1] *= c;
2243             weights[1][0][1] *= c;
2244             weights[1][1][1] *= c;
2245 
2246             if (verbose)
2247                 log_info("\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f, "
2248                          "frac( z - 0.5f ) = %f\n",
2249                          frac(x - 0.5f), frac(y - 0.5f), frac(z - 0.5f));
2250 
2251             for (int i = 0; i < 3; i++)
2252             {
2253                 outData[i] = (float)((upLeftA[i] * weights[0][0][0])
2254                                      + (upRightA[i] * weights[1][0][0])
2255                                      + (lowLeftA[i] * weights[0][1][0])
2256                                      + (lowRightA[i] * weights[1][1][0])
2257                                      + (upLeftB[i] * weights[0][0][1])
2258                                      + (upRightB[i] * weights[1][0][1])
2259                                      + (lowLeftB[i] * weights[0][1][1])
2260                                      + (lowRightB[i] * weights[1][1][1]));
2261                 // flush subnormal results to zero if necessary
2262                 if (NULL == containsDenorms && fabs(outData[i]) < FLT_MIN)
2263                     outData[i] = copysignf(0.0f, outData[i]);
2264             }
2265             outData[3] = (float)((upLeftA[3] * weights[0][0][0])
2266                                  + (upRightA[3] * weights[1][0][0])
2267                                  + (lowLeftA[3] * weights[0][1][0])
2268                                  + (lowRightA[3] * weights[1][1][0])
2269                                  + (upLeftB[3] * weights[0][0][1])
2270                                  + (upRightB[3] * weights[1][0][1])
2271                                  + (lowLeftB[3] * weights[0][1][1])
2272                                  + (lowRightB[3] * weights[1][1][1]));
2273             // flush subnormal results to zero if necessary
2274             if (NULL == containsDenorms && fabs(outData[3]) < FLT_MIN)
2275                 outData[3] = copysignf(0.0f, outData[3]);
2276         }
2277 
2278         return returnVal;
2279     }
2280 }
2281 
sample_image_pixel_float_offset(void * imageData,image_descriptor * imageInfo,float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms)2282 FloatPixel sample_image_pixel_float_offset(
2283     void *imageData, image_descriptor *imageInfo, float x, float y, float z,
2284     float xAddressOffset, float yAddressOffset, float zAddressOffset,
2285     image_sampler_data *imageSampler, float *outData, int verbose,
2286     int *containsDenorms)
2287 {
2288     return sample_image_pixel_float_offset(
2289         imageData, imageInfo, x, y, z, xAddressOffset, yAddressOffset,
2290         zAddressOffset, imageSampler, outData, verbose, containsDenorms, 0);
2291 }
2292 
2293 
debug_find_vector_in_image(void * imagePtr,image_descriptor * imageInfo,void * vectorToFind,size_t vectorSize,int * outX,int * outY,int * outZ,size_t lod)2294 int debug_find_vector_in_image(void *imagePtr, image_descriptor *imageInfo,
2295                                void *vectorToFind, size_t vectorSize, int *outX,
2296                                int *outY, int *outZ, size_t lod)
2297 {
2298     int foundCount = 0;
2299     char *iPtr = (char *)imagePtr;
2300     size_t width;
2301     size_t depth;
2302     size_t height;
2303     size_t row_pitch;
2304     size_t slice_pitch;
2305 
2306     switch (imageInfo->type)
2307     {
2308         case CL_MEM_OBJECT_IMAGE1D:
2309             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2310             height = 1;
2311             depth = 1;
2312             break;
2313         case CL_MEM_OBJECT_IMAGE1D_ARRAY:
2314             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2315             height = 1;
2316             depth = imageInfo->arraySize;
2317             break;
2318         case CL_MEM_OBJECT_IMAGE2D:
2319             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2320             height =
2321                 (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2322             depth = 1;
2323             break;
2324         case CL_MEM_OBJECT_IMAGE2D_ARRAY:
2325             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2326             height =
2327                 (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2328             depth = imageInfo->arraySize;
2329             break;
2330         case CL_MEM_OBJECT_IMAGE3D:
2331             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2332             height =
2333                 (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2334             depth = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
2335             break;
2336     }
2337 
2338     row_pitch = width * get_pixel_size(imageInfo->format);
2339     slice_pitch = row_pitch * height;
2340 
2341     for (size_t z = 0; z < depth; z++)
2342     {
2343         for (size_t y = 0; y < height; y++)
2344         {
2345             for (size_t x = 0; x < width; x++)
2346             {
2347                 if (memcmp(iPtr, vectorToFind, vectorSize) == 0)
2348                 {
2349                     if (foundCount == 0)
2350                     {
2351                         *outX = (int)x;
2352                         if (outY != NULL) *outY = (int)y;
2353                         if (outZ != NULL) *outZ = (int)z;
2354                     }
2355                     foundCount++;
2356                 }
2357                 iPtr += vectorSize;
2358             }
2359             iPtr += row_pitch - (width * vectorSize);
2360         }
2361         iPtr += slice_pitch - (height * row_pitch);
2362     }
2363     return foundCount;
2364 }
2365 
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,unsigned int * valuesToFind,int * outX,int * outY,int * outZ,int lod)2366 int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo,
2367                               unsigned int *valuesToFind, int *outX, int *outY,
2368                               int *outZ, int lod)
2369 {
2370     char vectorToFind[4 * 4];
2371     size_t vectorSize = get_format_channel_count(imageInfo->format);
2372 
2373 
2374     if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT8)
2375     {
2376         unsigned char *p = (unsigned char *)vectorToFind;
2377         for (unsigned int i = 0; i < vectorSize; i++)
2378             p[i] = (unsigned char)valuesToFind[i];
2379     }
2380     else if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT16)
2381     {
2382         unsigned short *p = (unsigned short *)vectorToFind;
2383         for (unsigned int i = 0; i < vectorSize; i++)
2384             p[i] = (unsigned short)valuesToFind[i];
2385         vectorSize *= 2;
2386     }
2387     else if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT32)
2388     {
2389         unsigned int *p = (unsigned int *)vectorToFind;
2390         for (unsigned int i = 0; i < vectorSize; i++)
2391             p[i] = (unsigned int)valuesToFind[i];
2392         vectorSize *= 4;
2393     }
2394     else
2395     {
2396         log_info("WARNING: Unable to search for debug pixel: invalid image "
2397                  "format\n");
2398         return false;
2399     }
2400     return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind,
2401                                       vectorSize, outX, outY, outZ, lod);
2402 }
2403 
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,int * valuesToFind,int * outX,int * outY,int * outZ,int lod)2404 int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo,
2405                               int *valuesToFind, int *outX, int *outY,
2406                               int *outZ, int lod)
2407 {
2408     char vectorToFind[4 * 4];
2409     size_t vectorSize = get_format_channel_count(imageInfo->format);
2410 
2411     if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT8)
2412     {
2413         char *p = (char *)vectorToFind;
2414         for (unsigned int i = 0; i < vectorSize; i++)
2415             p[i] = (char)valuesToFind[i];
2416     }
2417     else if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT16)
2418     {
2419         short *p = (short *)vectorToFind;
2420         for (unsigned int i = 0; i < vectorSize; i++)
2421             p[i] = (short)valuesToFind[i];
2422         vectorSize *= 2;
2423     }
2424     else if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT32)
2425     {
2426         int *p = (int *)vectorToFind;
2427         for (unsigned int i = 0; i < vectorSize; i++)
2428             p[i] = (int)valuesToFind[i];
2429         vectorSize *= 4;
2430     }
2431     else
2432     {
2433         log_info("WARNING: Unable to search for debug pixel: invalid image "
2434                  "format\n");
2435         return false;
2436     }
2437     return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind,
2438                                       vectorSize, outX, outY, outZ, lod);
2439 }
2440 
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,float * valuesToFind,int * outX,int * outY,int * outZ,int lod)2441 int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo,
2442                               float *valuesToFind, int *outX, int *outY,
2443                               int *outZ, int lod)
2444 {
2445     char vectorToFind[4 * 4];
2446     float swizzled[4];
2447     memcpy(swizzled, valuesToFind, sizeof(swizzled));
2448     size_t vectorSize = get_pixel_size(imageInfo->format);
2449     pack_image_pixel(swizzled, imageInfo->format, vectorToFind);
2450     return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind,
2451                                       vectorSize, outX, outY, outZ, lod);
2452 }
2453 
2454 template <class T>
swizzle_vector_for_image(T * srcVector,const cl_image_format * imageFormat)2455 void swizzle_vector_for_image(T *srcVector, const cl_image_format *imageFormat)
2456 {
2457     T temp;
2458     switch (imageFormat->image_channel_order)
2459     {
2460         case CL_A: srcVector[0] = srcVector[3]; break;
2461         case CL_R:
2462         case CL_Rx:
2463         case CL_RG:
2464         case CL_RGx:
2465         case CL_RGB:
2466         case CL_RGBx:
2467         case CL_RGBA:
2468         case CL_sRGB:
2469         case CL_sRGBx:
2470         case CL_sRGBA: break;
2471         case CL_RA: srcVector[1] = srcVector[3]; break;
2472         case CL_ARGB:
2473             temp = srcVector[3];
2474             srcVector[3] = srcVector[2];
2475             srcVector[2] = srcVector[1];
2476             srcVector[1] = srcVector[0];
2477             srcVector[0] = temp;
2478             break;
2479         case CL_ABGR:
2480             temp = srcVector[3];
2481             srcVector[3] = srcVector[0];
2482             srcVector[0] = temp;
2483             temp = srcVector[2];
2484             srcVector[2] = srcVector[1];
2485             srcVector[1] = temp;
2486             break;
2487         case CL_BGRA:
2488         case CL_sBGRA:
2489             temp = srcVector[0];
2490             srcVector[0] = srcVector[2];
2491             srcVector[2] = temp;
2492             break;
2493         case CL_INTENSITY:
2494             srcVector[3] = srcVector[0];
2495             srcVector[2] = srcVector[0];
2496             srcVector[1] = srcVector[0];
2497             break;
2498         case CL_LUMINANCE:
2499             srcVector[2] = srcVector[0];
2500             srcVector[1] = srcVector[0];
2501             break;
2502 #ifdef CL_1RGB_APPLE
2503         case CL_1RGB_APPLE:
2504             temp = srcVector[3];
2505             srcVector[3] = srcVector[2];
2506             srcVector[2] = srcVector[1];
2507             srcVector[1] = srcVector[0];
2508             srcVector[0] = temp;
2509             break;
2510 #endif
2511 #ifdef CL_BGR1_APPLE
2512         case CL_BGR1_APPLE:
2513             temp = srcVector[0];
2514             srcVector[0] = srcVector[2];
2515             srcVector[2] = temp;
2516             break;
2517 #endif
2518     }
2519 }
2520 
2521 #define SATURATE(v, min, max) (v < min ? min : (v > max ? max : v))
2522 
pack_image_pixel(unsigned int * srcVector,const cl_image_format * imageFormat,void * outData)2523 void pack_image_pixel(unsigned int *srcVector,
2524                       const cl_image_format *imageFormat, void *outData)
2525 {
2526     swizzle_vector_for_image<unsigned int>(srcVector, imageFormat);
2527     size_t channelCount = get_format_channel_count(imageFormat);
2528 
2529     switch (imageFormat->image_channel_data_type)
2530     {
2531         case CL_UNSIGNED_INT8: {
2532             unsigned char *ptr = (unsigned char *)outData;
2533             for (unsigned int i = 0; i < channelCount; i++)
2534                 ptr[i] = (unsigned char)SATURATE(srcVector[i], 0, 255);
2535             break;
2536         }
2537         case CL_UNSIGNED_INT16: {
2538             unsigned short *ptr = (unsigned short *)outData;
2539             for (unsigned int i = 0; i < channelCount; i++)
2540                 ptr[i] = (unsigned short)SATURATE(srcVector[i], 0, 65535);
2541             break;
2542         }
2543         case CL_UNSIGNED_INT32: {
2544             unsigned int *ptr = (unsigned int *)outData;
2545             for (unsigned int i = 0; i < channelCount; i++)
2546                 ptr[i] = (unsigned int)srcVector[i];
2547             break;
2548         }
2549         default: break;
2550     }
2551 }
2552 
pack_image_pixel(int * srcVector,const cl_image_format * imageFormat,void * outData)2553 void pack_image_pixel(int *srcVector, const cl_image_format *imageFormat,
2554                       void *outData)
2555 {
2556     swizzle_vector_for_image<int>(srcVector, imageFormat);
2557     size_t chanelCount = get_format_channel_count(imageFormat);
2558 
2559     switch (imageFormat->image_channel_data_type)
2560     {
2561         case CL_SIGNED_INT8: {
2562             char *ptr = (char *)outData;
2563             for (unsigned int i = 0; i < chanelCount; i++)
2564                 ptr[i] = (char)SATURATE(srcVector[i], -128, 127);
2565             break;
2566         }
2567         case CL_SIGNED_INT16: {
2568             short *ptr = (short *)outData;
2569             for (unsigned int i = 0; i < chanelCount; i++)
2570                 ptr[i] = (short)SATURATE(srcVector[i], -32768, 32767);
2571             break;
2572         }
2573         case CL_SIGNED_INT32: {
2574             int *ptr = (int *)outData;
2575             for (unsigned int i = 0; i < chanelCount; i++)
2576                 ptr[i] = (int)srcVector[i];
2577             break;
2578         }
2579         default: break;
2580     }
2581 }
2582 
round_to_even(float v)2583 int round_to_even(float v)
2584 {
2585     // clamp overflow
2586     if (v >= -(float)INT_MIN) return INT_MAX;
2587     if (v <= (float)INT_MIN) return INT_MIN;
2588 
2589     // round fractional values to integer value
2590     if (fabsf(v) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23))
2591     {
2592         static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23),
2593                                         MAKE_HEX_FLOAT(-0x1.0p23f, -0x1L, 23) };
2594         float magicVal = magic[v < 0.0f];
2595         v += magicVal;
2596         v -= magicVal;
2597     }
2598 
2599     return (int)v;
2600 }
2601 
pack_image_pixel(float * srcVector,const cl_image_format * imageFormat,void * outData)2602 void pack_image_pixel(float *srcVector, const cl_image_format *imageFormat,
2603                       void *outData)
2604 {
2605     swizzle_vector_for_image<float>(srcVector, imageFormat);
2606     size_t channelCount = get_format_channel_count(imageFormat);
2607     switch (imageFormat->image_channel_data_type)
2608     {
2609         case CL_HALF_FLOAT: {
2610             cl_half *ptr = (cl_half *)outData;
2611 
2612             switch (gFloatToHalfRoundingMode)
2613             {
2614                 case kRoundToNearestEven:
2615                     for (unsigned int i = 0; i < channelCount; i++)
2616                         ptr[i] = cl_half_from_float(srcVector[i], CL_HALF_RTE);
2617                     break;
2618                 case kRoundTowardZero:
2619                     for (unsigned int i = 0; i < channelCount; i++)
2620                         ptr[i] = cl_half_from_float(srcVector[i], CL_HALF_RTZ);
2621                     break;
2622                 default:
2623                     log_error("ERROR: Test internal error -- unhandled or "
2624                               "unknown float->half rounding mode.\n");
2625                     exit(-1);
2626                     break;
2627             }
2628             break;
2629         }
2630 
2631         case CL_FLOAT: {
2632             cl_float *ptr = (cl_float *)outData;
2633             for (unsigned int i = 0; i < channelCount; i++)
2634                 ptr[i] = srcVector[i];
2635             break;
2636         }
2637 
2638         case CL_SNORM_INT8: {
2639             cl_char *ptr = (cl_char *)outData;
2640             for (unsigned int i = 0; i < channelCount; i++)
2641                 ptr[i] =
2642                     (cl_char)NORMALIZE_SIGNED(srcVector[i], -127.0f, 127.f);
2643             break;
2644         }
2645         case CL_SNORM_INT16: {
2646             cl_short *ptr = (cl_short *)outData;
2647             for (unsigned int i = 0; i < channelCount; i++)
2648                 ptr[i] =
2649                     (short)NORMALIZE_SIGNED(srcVector[i], -32767.f, 32767.f);
2650             break;
2651         }
2652         case CL_UNORM_INT8: {
2653             cl_uchar *ptr = (cl_uchar *)outData;
2654             if (is_sRGBA_order(imageFormat->image_channel_order))
2655             {
2656                 ptr[0] = (unsigned char)(sRGBmap(srcVector[0]) + 0.5);
2657                 ptr[1] = (unsigned char)(sRGBmap(srcVector[1]) + 0.5);
2658                 ptr[2] = (unsigned char)(sRGBmap(srcVector[2]) + 0.5);
2659                 if (channelCount == 4)
2660                     ptr[3] = (unsigned char)NORMALIZE(srcVector[3], 255.f);
2661             }
2662             else
2663             {
2664                 for (unsigned int i = 0; i < channelCount; i++)
2665                     ptr[i] = (unsigned char)NORMALIZE(srcVector[i], 255.f);
2666             }
2667 #ifdef CL_1RGB_APPLE
2668             if (imageFormat->image_channel_order == CL_1RGB_APPLE)
2669                 ptr[0] = 255.0f;
2670 #endif
2671 #ifdef CL_BGR1_APPLE
2672             if (imageFormat->image_channel_order == CL_BGR1_APPLE)
2673                 ptr[3] = 255.0f;
2674 #endif
2675             break;
2676         }
2677         case CL_UNORM_INT16: {
2678             cl_ushort *ptr = (cl_ushort *)outData;
2679             for (unsigned int i = 0; i < channelCount; i++)
2680                 ptr[i] = (unsigned short)NORMALIZE(srcVector[i], 65535.f);
2681             break;
2682         }
2683         case CL_UNORM_SHORT_555: {
2684             cl_ushort *ptr = (cl_ushort *)outData;
2685             ptr[0] =
2686                 (((unsigned short)NORMALIZE(srcVector[0], 31.f) & 31) << 10)
2687                 | (((unsigned short)NORMALIZE(srcVector[1], 31.f) & 31) << 5)
2688                 | (((unsigned short)NORMALIZE(srcVector[2], 31.f) & 31) << 0);
2689             break;
2690         }
2691         case CL_UNORM_SHORT_565: {
2692             cl_ushort *ptr = (cl_ushort *)outData;
2693             ptr[0] =
2694                 (((unsigned short)NORMALIZE(srcVector[0], 31.f) & 31) << 11)
2695                 | (((unsigned short)NORMALIZE(srcVector[1], 63.f) & 63) << 5)
2696                 | (((unsigned short)NORMALIZE(srcVector[2], 31.f) & 31) << 0);
2697             break;
2698         }
2699         case CL_UNORM_INT_101010: {
2700             cl_uint *ptr = (cl_uint *)outData;
2701             ptr[0] =
2702                 (((unsigned int)NORMALIZE(srcVector[0], 1023.f) & 1023) << 20)
2703                 | (((unsigned int)NORMALIZE(srcVector[1], 1023.f) & 1023) << 10)
2704                 | (((unsigned int)NORMALIZE(srcVector[2], 1023.f) & 1023) << 0);
2705             break;
2706         }
2707         case CL_SIGNED_INT8: {
2708             cl_char *ptr = (cl_char *)outData;
2709             for (unsigned int i = 0; i < channelCount; i++)
2710                 ptr[i] =
2711                     (cl_char)CONVERT_INT(srcVector[i], -127.0f, 127.f, 127);
2712             break;
2713         }
2714         case CL_SIGNED_INT16: {
2715             cl_short *ptr = (cl_short *)outData;
2716             for (unsigned int i = 0; i < channelCount; i++)
2717                 ptr[i] =
2718                     (short)CONVERT_INT(srcVector[i], -32767.f, 32767.f, 32767);
2719             break;
2720         }
2721         case CL_SIGNED_INT32: {
2722             cl_int *ptr = (cl_int *)outData;
2723             for (unsigned int i = 0; i < channelCount; i++)
2724                 ptr[i] = (int)CONVERT_INT(
2725                     srcVector[i], MAKE_HEX_FLOAT(-0x1.0p31f, -1, 31),
2726                     MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffe, 30 - 23),
2727                     CL_INT_MAX);
2728             break;
2729         }
2730         case CL_UNSIGNED_INT8: {
2731             cl_uchar *ptr = (cl_uchar *)outData;
2732             for (unsigned int i = 0; i < channelCount; i++)
2733                 ptr[i] =
2734                     (cl_uchar)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX);
2735             break;
2736         }
2737         case CL_UNSIGNED_INT16: {
2738             cl_ushort *ptr = (cl_ushort *)outData;
2739             for (unsigned int i = 0; i < channelCount; i++)
2740                 ptr[i] = (cl_ushort)CONVERT_UINT(srcVector[i], 32767.f,
2741                                                  CL_USHRT_MAX);
2742             break;
2743         }
2744         case CL_UNSIGNED_INT32: {
2745             cl_uint *ptr = (cl_uint *)outData;
2746             for (unsigned int i = 0; i < channelCount; i++)
2747                 ptr[i] = (cl_uint)CONVERT_UINT(
2748                     srcVector[i],
2749                     MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffe, 31 - 23),
2750                     CL_UINT_MAX);
2751             break;
2752         }
2753 #ifdef CL_SFIXED14_APPLE
2754         case CL_SFIXED14_APPLE: {
2755             cl_ushort *ptr = (cl_ushort *)outData;
2756             for (unsigned int i = 0; i < channelCount; i++)
2757             {
2758                 cl_float f = fmaxf(srcVector[i], -1.0f);
2759                 f = fminf(f, 3.0f);
2760                 cl_int d = rintf(f * 0x1.0p14f);
2761                 d += 16384;
2762                 if (d > CL_USHRT_MAX) d = CL_USHRT_MAX;
2763                 ptr[i] = d;
2764             }
2765             break;
2766         }
2767 #endif
2768         default:
2769             log_error("INTERNAL ERROR: unknown format (%d)\n",
2770                       imageFormat->image_channel_data_type);
2771             exit(-1);
2772             break;
2773     }
2774 }
2775 
pack_image_pixel_error(const float * srcVector,const cl_image_format * imageFormat,const void * results,float * errors)2776 void pack_image_pixel_error(const float *srcVector,
2777                             const cl_image_format *imageFormat,
2778                             const void *results, float *errors)
2779 {
2780     size_t channelCount = get_format_channel_count(imageFormat);
2781     switch (imageFormat->image_channel_data_type)
2782     {
2783         case CL_HALF_FLOAT: {
2784             const cl_half *ptr = (const cl_half *)results;
2785 
2786             for (unsigned int i = 0; i < channelCount; i++)
2787                 errors[i] = Ulp_Error_Half(ptr[i], srcVector[i]);
2788 
2789             break;
2790         }
2791 
2792         case CL_FLOAT: {
2793             const cl_ushort *ptr = (const cl_ushort *)results;
2794 
2795             for (unsigned int i = 0; i < channelCount; i++)
2796                 errors[i] = Ulp_Error(ptr[i], srcVector[i]);
2797 
2798             break;
2799         }
2800 
2801         case CL_SNORM_INT8: {
2802             const cl_char *ptr = (const cl_char *)results;
2803 
2804             for (unsigned int i = 0; i < channelCount; i++)
2805                 errors[i] = ptr[i]
2806                     - NORMALIZE_SIGNED_UNROUNDED(srcVector[i], -127.0f, 127.f);
2807 
2808             break;
2809         }
2810         case CL_SNORM_INT16: {
2811             const cl_short *ptr = (const cl_short *)results;
2812 
2813             for (unsigned int i = 0; i < channelCount; i++)
2814                 errors[i] = ptr[i]
2815                     - NORMALIZE_SIGNED_UNROUNDED(srcVector[i], -32767.f,
2816                                                  32767.f);
2817 
2818             break;
2819         }
2820         case CL_UNORM_INT8: {
2821             const cl_uchar *ptr = (const cl_uchar *)results;
2822 
2823             for (unsigned int i = 0; i < channelCount; i++)
2824                 errors[i] = ptr[i] - NORMALIZE_UNROUNDED(srcVector[i], 255.f);
2825 
2826             break;
2827         }
2828         case CL_UNORM_INT16: {
2829             const cl_ushort *ptr = (const cl_ushort *)results;
2830 
2831             for (unsigned int i = 0; i < channelCount; i++)
2832                 errors[i] = ptr[i] - NORMALIZE_UNROUNDED(srcVector[i], 65535.f);
2833 
2834             break;
2835         }
2836         case CL_UNORM_SHORT_555: {
2837             const cl_ushort *ptr = (const cl_ushort *)results;
2838 
2839             errors[0] =
2840                 ((ptr[0] >> 10) & 31) - NORMALIZE_UNROUNDED(srcVector[0], 31.f);
2841             errors[1] =
2842                 ((ptr[0] >> 5) & 31) - NORMALIZE_UNROUNDED(srcVector[1], 31.f);
2843             errors[2] =
2844                 ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED(srcVector[2], 31.f);
2845 
2846             break;
2847         }
2848         case CL_UNORM_SHORT_565: {
2849             const cl_ushort *ptr = (const cl_ushort *)results;
2850 
2851             errors[0] =
2852                 ((ptr[0] >> 11) & 31) - NORMALIZE_UNROUNDED(srcVector[0], 31.f);
2853             errors[1] =
2854                 ((ptr[0] >> 5) & 63) - NORMALIZE_UNROUNDED(srcVector[1], 63.f);
2855             errors[2] =
2856                 ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED(srcVector[2], 31.f);
2857 
2858             break;
2859         }
2860         case CL_UNORM_INT_101010: {
2861             const cl_uint *ptr = (const cl_uint *)results;
2862 
2863             errors[0] = ((ptr[0] >> 20) & 1023)
2864                 - NORMALIZE_UNROUNDED(srcVector[0], 1023.f);
2865             errors[1] = ((ptr[0] >> 10) & 1023)
2866                 - NORMALIZE_UNROUNDED(srcVector[1], 1023.f);
2867             errors[2] = ((ptr[0] >> 0) & 1023)
2868                 - NORMALIZE_UNROUNDED(srcVector[2], 1023.f);
2869 
2870             break;
2871         }
2872         case CL_SIGNED_INT8: {
2873             const cl_char *ptr = (const cl_char *)results;
2874 
2875             for (unsigned int i = 0; i < channelCount; i++)
2876                 errors[i] =
2877                     ptr[i] - CONVERT_INT(srcVector[i], -127.0f, 127.f, 127);
2878 
2879             break;
2880         }
2881         case CL_SIGNED_INT16: {
2882             const cl_short *ptr = (const cl_short *)results;
2883             for (unsigned int i = 0; i < channelCount; i++)
2884                 errors[i] = ptr[i]
2885                     - CONVERT_INT(srcVector[i], -32767.f, 32767.f, 32767);
2886             break;
2887         }
2888         case CL_SIGNED_INT32: {
2889             const cl_int *ptr = (const cl_int *)results;
2890             for (unsigned int i = 0; i < channelCount; i++)
2891                 errors[i] = (cl_float)(
2892                     (cl_long)ptr[i]
2893                     - (cl_long)CONVERT_INT(
2894                         srcVector[i], MAKE_HEX_FLOAT(-0x1.0p31f, -1, 31),
2895                         MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffe, 30 - 23),
2896                         CL_INT_MAX));
2897             break;
2898         }
2899         case CL_UNSIGNED_INT8: {
2900             const cl_uchar *ptr = (const cl_uchar *)results;
2901             for (unsigned int i = 0; i < channelCount; i++)
2902                 errors[i] = (cl_int)ptr[i]
2903                     - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX);
2904             break;
2905         }
2906         case CL_UNSIGNED_INT16: {
2907             const cl_ushort *ptr = (const cl_ushort *)results;
2908             for (unsigned int i = 0; i < channelCount; i++)
2909                 errors[i] = (cl_int)ptr[i]
2910                     - (cl_int)CONVERT_UINT(srcVector[i], 32767.f, CL_USHRT_MAX);
2911             break;
2912         }
2913         case CL_UNSIGNED_INT32: {
2914             const cl_uint *ptr = (const cl_uint *)results;
2915             for (unsigned int i = 0; i < channelCount; i++)
2916                 errors[i] = (cl_float)(
2917                     (cl_long)ptr[i]
2918                     - (cl_long)CONVERT_UINT(
2919                         srcVector[i],
2920                         MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffe, 31 - 23),
2921                         CL_UINT_MAX));
2922             break;
2923         }
2924 #ifdef CL_SFIXED14_APPLE
2925         case CL_SFIXED14_APPLE: {
2926             const cl_ushort *ptr = (const cl_ushort *)results;
2927 
2928             for (unsigned int i = 0; i < channelCount; i++)
2929                 errors[i] = ptr[i]
2930                     - NORMALIZE_SIGNED_UNROUNDED(((int)srcVector[i] - 16384),
2931                                                  -16384.f, 49151.f);
2932 
2933             break;
2934         }
2935 #endif
2936         default:
2937             log_error("INTERNAL ERROR: unknown format (%d)\n",
2938                       imageFormat->image_channel_data_type);
2939             exit(-1);
2940             break;
2941     }
2942 }
2943 
2944 
2945 //
2946 //  Autodetect which rounding mode is used for image writes to CL_HALF_FLOAT
2947 //  This should be called lazily before attempting to verify image writes,
2948 //  otherwise an error will occur.
2949 //
DetectFloatToHalfRoundingMode(cl_command_queue q)2950 int DetectFloatToHalfRoundingMode(
2951     cl_command_queue q) // Returns CL_SUCCESS on success
2952 {
2953     cl_int err = CL_SUCCESS;
2954 
2955     if (gFloatToHalfRoundingMode == kDefaultRoundingMode)
2956     {
2957         // Some numbers near 0.5f, that we look at to see how the values are
2958         // rounded.
2959         static const cl_uint inData[4 * 4] = {
2960             0x3f000fffU, 0x3f001000U, 0x3f001001U, 0U,
2961             0x3f001fffU, 0x3f002000U, 0x3f002001U, 0U,
2962             0x3f002fffU, 0x3f003000U, 0x3f003001U, 0U,
2963             0x3f003fffU, 0x3f004000U, 0x3f004001U, 0U
2964         };
2965         static const size_t count = sizeof(inData) / (4 * sizeof(inData[0]));
2966         const float *inp = (const float *)inData;
2967         cl_context context = NULL;
2968 
2969         // Create an input buffer
2970         err = clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(context),
2971                                     &context, NULL);
2972         if (err)
2973         {
2974             log_error("Error:  could not get context from command queue in "
2975                       "DetectFloatToHalfRoundingMode  (%d)",
2976                       err);
2977             return err;
2978         }
2979 
2980         cl_mem inBuf = clCreateBuffer(context,
2981                                       CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR
2982                                           | CL_MEM_ALLOC_HOST_PTR,
2983                                       sizeof(inData), (void *)inData, &err);
2984         if (NULL == inBuf || err)
2985         {
2986             log_error("Error:  could not create input buffer in "
2987                       "DetectFloatToHalfRoundingMode  (err: %d)",
2988                       err);
2989             return err;
2990         }
2991 
2992         // Create a small output image
2993         cl_image_format fmt = { CL_RGBA, CL_HALF_FLOAT };
2994         cl_mem outImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &fmt,
2995                                           count, 1, 0, NULL, &err);
2996         if (NULL == outImage || err)
2997         {
2998             log_error("Error:  could not create half float out image in "
2999                       "DetectFloatToHalfRoundingMode  (err: %d)",
3000                       err);
3001             clReleaseMemObject(inBuf);
3002             return err;
3003         }
3004 
3005         // Create our program, and a kernel
3006         const char *kernelSource[1] = {
3007             "kernel void detect_round( global float4 *in, write_only image2d_t "
3008             "out )\n"
3009             "{\n"
3010             "   write_imagef( out, (int2)(get_global_id(0),0), "
3011             "in[get_global_id(0)] );\n"
3012             "}\n"
3013         };
3014 
3015         clProgramWrapper program;
3016         clKernelWrapper kernel;
3017         err = create_single_kernel_helper(context, &program, &kernel, 1,
3018                                           kernelSource, "detect_round");
3019 
3020         if (NULL == program || err)
3021         {
3022             log_error("Error:  could not create program in "
3023                       "DetectFloatToHalfRoundingMode (err: %d)",
3024                       err);
3025             clReleaseMemObject(inBuf);
3026             clReleaseMemObject(outImage);
3027             return err;
3028         }
3029 
3030         cl_device_id device = NULL;
3031         err = clGetCommandQueueInfo(q, CL_QUEUE_DEVICE, sizeof(device), &device,
3032                                     NULL);
3033         if (err)
3034         {
3035             log_error("Error:  could not get device from command queue in "
3036                       "DetectFloatToHalfRoundingMode  (%d)",
3037                       err);
3038             clReleaseMemObject(inBuf);
3039             clReleaseMemObject(outImage);
3040             return err;
3041         }
3042 
3043         err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inBuf);
3044         if (err)
3045         {
3046             log_error("Error: could not set argument 0 of kernel in "
3047                       "DetectFloatToHalfRoundingMode (%d)",
3048                       err);
3049             clReleaseMemObject(inBuf);
3050             clReleaseMemObject(outImage);
3051             return err;
3052         }
3053 
3054         err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outImage);
3055         if (err)
3056         {
3057             log_error("Error: could not set argument 1 of kernel in "
3058                       "DetectFloatToHalfRoundingMode (%d)",
3059                       err);
3060             clReleaseMemObject(inBuf);
3061             clReleaseMemObject(outImage);
3062             return err;
3063         }
3064 
3065         // Run the kernel
3066         size_t global_work_size = count;
3067         err = clEnqueueNDRangeKernel(q, kernel, 1, NULL, &global_work_size,
3068                                      NULL, 0, NULL, NULL);
3069         if (err)
3070         {
3071             log_error("Error: could not enqueue kernel in "
3072                       "DetectFloatToHalfRoundingMode (%d)",
3073                       err);
3074             clReleaseMemObject(inBuf);
3075             clReleaseMemObject(outImage);
3076             return err;
3077         }
3078 
3079         // read the results
3080         cl_half outBuf[count * 4];
3081         memset(outBuf, -1, sizeof(outBuf));
3082         size_t origin[3] = { 0, 0, 0 };
3083         size_t region[3] = { count, 1, 1 };
3084         err = clEnqueueReadImage(q, outImage, CL_TRUE, origin, region, 0, 0,
3085                                  outBuf, 0, NULL, NULL);
3086         if (err)
3087         {
3088             log_error("Error: could not read output image in "
3089                       "DetectFloatToHalfRoundingMode (%d)",
3090                       err);
3091             clReleaseMemObject(inBuf);
3092             clReleaseMemObject(outImage);
3093             return err;
3094         }
3095 
3096         // Generate our list of reference results
3097         cl_half rte_ref[count * 4];
3098         cl_half rtz_ref[count * 4];
3099         for (size_t i = 0; i < 4 * count; i++)
3100         {
3101             rte_ref[i] = cl_half_from_float(inp[i], CL_HALF_RTE);
3102             rtz_ref[i] = cl_half_from_float(inp[i], CL_HALF_RTZ);
3103         }
3104 
3105         // Verify that we got something in either rtz or rte mode
3106         if (0 == memcmp(rte_ref, outBuf, sizeof(rte_ref)))
3107         {
3108             log_info("Autodetected float->half rounding mode to be rte\n");
3109             gFloatToHalfRoundingMode = kRoundToNearestEven;
3110         }
3111         else if (0 == memcmp(rtz_ref, outBuf, sizeof(rtz_ref)))
3112         {
3113             log_info("Autodetected float->half rounding mode to be rtz\n");
3114             gFloatToHalfRoundingMode = kRoundTowardZero;
3115         }
3116         else
3117         {
3118             log_error("ERROR: float to half conversions proceed with invalid "
3119                       "rounding mode!\n");
3120             log_info("\nfor:");
3121             for (size_t i = 0; i < count; i++)
3122                 log_info(" {%a, %a, %a, %a},", inp[4 * i], inp[4 * i + 1],
3123                          inp[4 * i + 2], inp[4 * i + 3]);
3124             log_info("\ngot:");
3125             for (size_t i = 0; i < count; i++)
3126                 log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},",
3127                          outBuf[4 * i], outBuf[4 * i + 1], outBuf[4 * i + 2],
3128                          outBuf[4 * i + 3]);
3129             log_info("\nrte:");
3130             for (size_t i = 0; i < count; i++)
3131                 log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},",
3132                          rte_ref[4 * i], rte_ref[4 * i + 1], rte_ref[4 * i + 2],
3133                          rte_ref[4 * i + 3]);
3134             log_info("\nrtz:");
3135             for (size_t i = 0; i < count; i++)
3136                 log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},",
3137                          rtz_ref[4 * i], rtz_ref[4 * i + 1], rtz_ref[4 * i + 2],
3138                          rtz_ref[4 * i + 3]);
3139             log_info("\n");
3140             err = -1;
3141             gFloatToHalfRoundingMode = kRoundingModeCount; // illegal value
3142         }
3143 
3144         // clean up
3145         clReleaseMemObject(inBuf);
3146         clReleaseMemObject(outImage);
3147         return err;
3148     }
3149 
3150     // Make sure that the rounding mode was successfully detected, if we checked
3151     // earlier
3152     if (gFloatToHalfRoundingMode != kRoundToNearestEven
3153         && gFloatToHalfRoundingMode != kRoundTowardZero)
3154         return -2;
3155 
3156     return err;
3157 }
3158 
create_random_image_data(ExplicitType dataType,image_descriptor * imageInfo,BufferOwningPtr<char> & P,MTdata d,bool image2DFromBuffer)3159 char *create_random_image_data(ExplicitType dataType,
3160                                image_descriptor *imageInfo,
3161                                BufferOwningPtr<char> &P, MTdata d,
3162                                bool image2DFromBuffer)
3163 {
3164     size_t allocSize, numPixels;
3165     if (/*gTestMipmaps*/ imageInfo->num_mip_levels > 1)
3166     {
3167         allocSize = (size_t)(compute_mipmapped_image_size(*imageInfo) * 4
3168                              * get_explicit_type_size(dataType))
3169             / get_pixel_size(imageInfo->format);
3170         numPixels = allocSize / (get_explicit_type_size(dataType) * 4);
3171     }
3172     else
3173     {
3174         numPixels = (image2DFromBuffer ? imageInfo->rowPitch : imageInfo->width)
3175             * imageInfo->height * (imageInfo->depth ? imageInfo->depth : 1)
3176             * (imageInfo->arraySize ? imageInfo->arraySize : 1);
3177         allocSize = numPixels * 4 * get_explicit_type_size(dataType);
3178     }
3179 
3180 #if 0 // DEBUG
3181     {
3182       fprintf(stderr,"--- create_random_image_data:\n");
3183       fprintf(stderr,"allocSize = %zu\n",allocSize);
3184       fprintf(stderr,"numPixels = %zu\n",numPixels);
3185       fprintf(stderr,"width = %zu\n",imageInfo->width);
3186       fprintf(stderr,"height = %zu\n",imageInfo->height);
3187       fprintf(stderr,"depth = %zu\n",imageInfo->depth);
3188       fprintf(stderr,"rowPitch = %zu\n",imageInfo->rowPitch);
3189       fprintf(stderr,"slicePitch = %zu\n",imageInfo->slicePitch);
3190       fprintf(stderr,"arraySize = %zu\n",imageInfo->arraySize);
3191       fprintf(stderr,"explicit_type_size = %zu\n",get_explicit_type_size(dataType));
3192     }
3193 #endif
3194 
3195 #if defined(__APPLE__)
3196     char *data = NULL;
3197     if (gDeviceType == CL_DEVICE_TYPE_CPU)
3198     {
3199         size_t mapSize =
3200             ((allocSize + 4095L) & -4096L) + 8192; // alloc two extra pages.
3201 
3202         void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE,
3203                          MAP_ANON | MAP_PRIVATE, 0, 0);
3204         if (map == MAP_FAILED)
3205         {
3206             perror("create_random_image_data: mmap");
3207             log_error("%s:%d: mmap failed, mapSize = %zu\n", __FILE__, __LINE__,
3208                       mapSize);
3209         }
3210         intptr_t data_end = (intptr_t)map + mapSize - 4096;
3211         data = (char *)(data_end - (intptr_t)allocSize);
3212 
3213         mprotect(map, 4096, PROT_NONE);
3214         mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
3215         P.reset(data, map, mapSize);
3216     }
3217     else
3218     {
3219         data = (char *)malloc(allocSize);
3220         P.reset(data);
3221     }
3222 #else
3223     char *data =
3224         (char *)align_malloc(allocSize, get_pixel_alignment(imageInfo->format));
3225     P.reset(data, NULL, 0, allocSize, true);
3226 #endif
3227 
3228     if (data == NULL)
3229     {
3230         log_error(
3231             "ERROR: Unable to malloc %lu bytes for create_random_image_data\n",
3232             allocSize);
3233         return NULL;
3234     }
3235 
3236     switch (dataType)
3237     {
3238         case kFloat: {
3239             float *inputValues = (float *)data;
3240             switch (imageInfo->format->image_channel_data_type)
3241             {
3242                 case CL_HALF_FLOAT: {
3243                     // Generate data that is (mostly) inside the range of a half
3244                     // float const float HALF_MIN = 5.96046448e-08f;
3245                     const float HALF_MAX = 65504.0f;
3246 
3247                     size_t i = 0;
3248                     inputValues[i++] = 0.f;
3249                     inputValues[i++] = 1.f;
3250                     inputValues[i++] = -1.f;
3251                     inputValues[i++] = 2.f;
3252                     for (; i < numPixels * 4; i++)
3253                         inputValues[i] = get_random_float(-HALF_MAX - 2.f,
3254                                                           HALF_MAX + 2.f, d);
3255                 }
3256                 break;
3257 #ifdef CL_SFIXED14_APPLE
3258                 case CL_SFIXED14_APPLE: {
3259                     size_t i = 0;
3260                     if (numPixels * 4 >= 8)
3261                     {
3262                         inputValues[i++] = INFINITY;
3263                         inputValues[i++] = 0x1.0p14f;
3264                         inputValues[i++] = 0x1.0p31f;
3265                         inputValues[i++] = 0x1.0p32f;
3266                         inputValues[i++] = -INFINITY;
3267                         inputValues[i++] = -0x1.0p14f;
3268                         inputValues[i++] = -0x1.0p31f;
3269                         inputValues[i++] = -0x1.1p31f;
3270                     }
3271                     for (; i < numPixels * 4; i++)
3272                         inputValues[i] = get_random_float(-1.1f, 3.1f, d);
3273                 }
3274                 break;
3275 #endif
3276                 case CL_FLOAT: {
3277                     size_t i = 0;
3278                     inputValues[i++] = INFINITY;
3279                     inputValues[i++] = -INFINITY;
3280                     inputValues[i++] = 0.0f;
3281                     inputValues[i++] = 0.0f;
3282                     cl_uint *p = (cl_uint *)data;
3283                     for (; i < numPixels * 4; i++) p[i] = genrand_int32(d);
3284                 }
3285                 break;
3286 
3287                 default:
3288                     size_t i = 0;
3289                     if (numPixels * 4 >= 36)
3290                     {
3291                         inputValues[i++] = 0.0f;
3292                         inputValues[i++] = 0.5f;
3293                         inputValues[i++] = 31.5f;
3294                         inputValues[i++] = 32.0f;
3295                         inputValues[i++] = 127.5f;
3296                         inputValues[i++] = 128.0f;
3297                         inputValues[i++] = 255.5f;
3298                         inputValues[i++] = 256.0f;
3299                         inputValues[i++] = 1023.5f;
3300                         inputValues[i++] = 1024.0f;
3301                         inputValues[i++] = 32767.5f;
3302                         inputValues[i++] = 32768.0f;
3303                         inputValues[i++] = 65535.5f;
3304                         inputValues[i++] = 65536.0f;
3305                         inputValues[i++] = 2147483648.0f;
3306                         inputValues[i++] = 4294967296.0f;
3307                         inputValues[i++] = MAKE_HEX_FLOAT(0x1.0p63f, 1, 63);
3308                         inputValues[i++] = MAKE_HEX_FLOAT(0x1.0p64f, 1, 64);
3309                         inputValues[i++] = -0.0f;
3310                         inputValues[i++] = -0.5f;
3311                         inputValues[i++] = -31.5f;
3312                         inputValues[i++] = -32.0f;
3313                         inputValues[i++] = -127.5f;
3314                         inputValues[i++] = -128.0f;
3315                         inputValues[i++] = -255.5f;
3316                         inputValues[i++] = -256.0f;
3317                         inputValues[i++] = -1023.5f;
3318                         inputValues[i++] = -1024.0f;
3319                         inputValues[i++] = -32767.5f;
3320                         inputValues[i++] = -32768.0f;
3321                         inputValues[i++] = -65535.5f;
3322                         inputValues[i++] = -65536.0f;
3323                         inputValues[i++] = -2147483648.0f;
3324                         inputValues[i++] = -4294967296.0f;
3325                         inputValues[i++] = -MAKE_HEX_FLOAT(0x1.0p63f, 1, 63);
3326                         inputValues[i++] = -MAKE_HEX_FLOAT(0x1.0p64f, 1, 64);
3327                     }
3328                     if (is_format_signed(imageInfo->format))
3329                     {
3330                         for (; i < numPixels * 4; i++)
3331                             inputValues[i] = get_random_float(-1.1f, 1.1f, d);
3332                     }
3333                     else
3334                     {
3335                         for (; i < numPixels * 4; i++)
3336                             inputValues[i] = get_random_float(-0.1f, 1.1f, d);
3337                     }
3338                     break;
3339             }
3340             break;
3341         }
3342 
3343         case kInt: {
3344             int *imageData = (int *)data;
3345 
3346             // We want to generate ints (mostly) in range of the target format
3347             int formatMin = get_format_min_int(imageInfo->format);
3348             size_t formatMax = get_format_max_int(imageInfo->format);
3349             if (formatMin == 0)
3350             {
3351                 // Unsigned values, but we are only an int, so cap the actual
3352                 // max at the max of signed ints
3353                 if (formatMax > 2147483647L) formatMax = 2147483647L;
3354             }
3355             // If the final format is small enough, give us a bit of room for
3356             // out-of-range values to test
3357             if (formatMax < 2147483647L) formatMax += 2;
3358             if (formatMin > -2147483648LL) formatMin -= 2;
3359 
3360             // Now gen
3361             for (size_t i = 0; i < numPixels * 4; i++)
3362             {
3363                 imageData[i] = random_in_range(formatMin, (int)formatMax, d);
3364             }
3365             break;
3366         }
3367 
3368         case kUInt:
3369         case kUnsignedInt: {
3370             unsigned int *imageData = (unsigned int *)data;
3371 
3372             // We want to generate ints (mostly) in range of the target format
3373             int formatMin = get_format_min_int(imageInfo->format);
3374             size_t formatMax = get_format_max_int(imageInfo->format);
3375             if (formatMin < 0) formatMin = 0;
3376             // If the final format is small enough, give us a bit of room for
3377             // out-of-range values to test
3378             if (formatMax < 4294967295LL) formatMax += 2;
3379 
3380             // Now gen
3381             for (size_t i = 0; i < numPixels * 4; i++)
3382             {
3383                 imageData[i] = random_in_range(formatMin, (int)formatMax, d);
3384             }
3385             break;
3386         }
3387         default:
3388             // Unsupported source format
3389             delete[] data;
3390             return NULL;
3391     }
3392 
3393     return data;
3394 }
3395 
3396 /*
3397     deprecated
3398 bool clamp_image_coord( image_sampler_data *imageSampler, float value, size_t
3399 max, int &outValue )
3400 {
3401     int v = (int)value;
3402 
3403     switch(imageSampler->addressing_mode)
3404     {
3405         case CL_ADDRESS_REPEAT:
3406             outValue = v;
3407             while( v < 0 )
3408                 v += (int)max;
3409             while( v >= (int)max )
3410                 v -= (int)max;
3411             if( v != outValue )
3412             {
3413                 outValue = v;
3414                 return true;
3415             }
3416             return false;
3417 
3418         case CL_ADDRESS_MIRRORED_REPEAT:
3419             log_info( "ERROR: unimplemented for CL_ADDRESS_MIRRORED_REPEAT. Do
3420 we ever use this? exit(-1);
3421 
3422         default:
3423             if( v < 0 )
3424             {
3425                 outValue = 0;
3426                 return true;
3427             }
3428             if( v >= (int)max )
3429             {
3430                 outValue = (int)max - 1;
3431                 return true;
3432             }
3433             outValue = v;
3434             return false;
3435     }
3436 
3437 }
3438 */
3439 
get_sampler_kernel_code(image_sampler_data * imageSampler,char * outLine)3440 void get_sampler_kernel_code(image_sampler_data *imageSampler, char *outLine)
3441 {
3442     const char *normalized;
3443     const char *addressMode;
3444     const char *filterMode;
3445 
3446     if (imageSampler->addressing_mode == CL_ADDRESS_CLAMP)
3447         addressMode = "CLK_ADDRESS_CLAMP";
3448     else if (imageSampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE)
3449         addressMode = "CLK_ADDRESS_CLAMP_TO_EDGE";
3450     else if (imageSampler->addressing_mode == CL_ADDRESS_REPEAT)
3451         addressMode = "CLK_ADDRESS_REPEAT";
3452     else if (imageSampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT)
3453         addressMode = "CLK_ADDRESS_MIRRORED_REPEAT";
3454     else if (imageSampler->addressing_mode == CL_ADDRESS_NONE)
3455         addressMode = "CLK_ADDRESS_NONE";
3456     else
3457     {
3458         log_error("**Error: Unknown addressing mode! Aborting...\n");
3459         abort();
3460     }
3461 
3462     if (imageSampler->normalized_coords)
3463         normalized = "CLK_NORMALIZED_COORDS_TRUE";
3464     else
3465         normalized = "CLK_NORMALIZED_COORDS_FALSE";
3466 
3467     if (imageSampler->filter_mode == CL_FILTER_LINEAR)
3468         filterMode = "CLK_FILTER_LINEAR";
3469     else
3470         filterMode = "CLK_FILTER_NEAREST";
3471 
3472     sprintf(outLine, "    const sampler_t imageSampler = %s | %s | %s;\n",
3473             addressMode, filterMode, normalized);
3474 }
3475 
copy_image_data(image_descriptor * srcImageInfo,image_descriptor * dstImageInfo,void * imageValues,void * destImageValues,const size_t sourcePos[],const size_t destPos[],const size_t regionSize[])3476 void copy_image_data(image_descriptor *srcImageInfo,
3477                      image_descriptor *dstImageInfo, void *imageValues,
3478                      void *destImageValues, const size_t sourcePos[],
3479                      const size_t destPos[], const size_t regionSize[])
3480 {
3481     //  assert( srcImageInfo->format == dstImageInfo->format );
3482 
3483     size_t src_mip_level_offset = 0, dst_mip_level_offset = 0;
3484     size_t sourcePos_lod[3], destPos_lod[3], src_lod, dst_lod;
3485     size_t src_row_pitch_lod, src_slice_pitch_lod;
3486     size_t dst_row_pitch_lod, dst_slice_pitch_lod;
3487 
3488     size_t pixelSize = get_pixel_size(srcImageInfo->format);
3489 
3490     sourcePos_lod[0] = sourcePos[0];
3491     sourcePos_lod[1] = sourcePos[1];
3492     sourcePos_lod[2] = sourcePos[2];
3493     destPos_lod[0] = destPos[0];
3494     destPos_lod[1] = destPos[1];
3495     destPos_lod[2] = destPos[2];
3496     src_row_pitch_lod = srcImageInfo->rowPitch;
3497     dst_row_pitch_lod = dstImageInfo->rowPitch;
3498     src_slice_pitch_lod = srcImageInfo->slicePitch;
3499     dst_slice_pitch_lod = dstImageInfo->slicePitch;
3500 
3501     if (srcImageInfo->num_mip_levels > 1)
3502     {
3503         size_t src_width_lod = 1 /*srcImageInfo->width*/;
3504         size_t src_height_lod = 1 /*srcImageInfo->height*/;
3505         size_t src_depth_lod = 1 /*srcImageInfo->depth*/;
3506 
3507         switch (srcImageInfo->type)
3508         {
3509             case CL_MEM_OBJECT_IMAGE1D:
3510                 src_lod = sourcePos[1];
3511                 sourcePos_lod[1] = sourcePos_lod[2] = 0;
3512                 src_width_lod = (srcImageInfo->width >> src_lod)
3513                     ? (srcImageInfo->width >> src_lod)
3514                     : 1;
3515                 break;
3516             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3517             case CL_MEM_OBJECT_IMAGE2D:
3518                 src_lod = sourcePos[2];
3519                 sourcePos_lod[1] = sourcePos[1];
3520                 sourcePos_lod[2] = 0;
3521                 src_width_lod = (srcImageInfo->width >> src_lod)
3522                     ? (srcImageInfo->width >> src_lod)
3523                     : 1;
3524                 if (srcImageInfo->type == CL_MEM_OBJECT_IMAGE2D)
3525                     src_height_lod = (srcImageInfo->height >> src_lod)
3526                         ? (srcImageInfo->height >> src_lod)
3527                         : 1;
3528                 break;
3529             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3530             case CL_MEM_OBJECT_IMAGE3D:
3531                 src_lod = sourcePos[3];
3532                 sourcePos_lod[1] = sourcePos[1];
3533                 sourcePos_lod[2] = sourcePos[2];
3534                 src_width_lod = (srcImageInfo->width >> src_lod)
3535                     ? (srcImageInfo->width >> src_lod)
3536                     : 1;
3537                 src_height_lod = (srcImageInfo->height >> src_lod)
3538                     ? (srcImageInfo->height >> src_lod)
3539                     : 1;
3540                 if (srcImageInfo->type == CL_MEM_OBJECT_IMAGE3D)
3541                     src_depth_lod = (srcImageInfo->depth >> src_lod)
3542                         ? (srcImageInfo->depth >> src_lod)
3543                         : 1;
3544                 break;
3545         }
3546         src_mip_level_offset = compute_mip_level_offset(srcImageInfo, src_lod);
3547         src_row_pitch_lod =
3548             src_width_lod * get_pixel_size(srcImageInfo->format);
3549         src_slice_pitch_lod = src_row_pitch_lod * src_height_lod;
3550     }
3551 
3552     if (dstImageInfo->num_mip_levels > 1)
3553     {
3554         size_t dst_width_lod = 1 /*dstImageInfo->width*/;
3555         size_t dst_height_lod = 1 /*dstImageInfo->height*/;
3556         size_t dst_depth_lod = 1 /*dstImageInfo->depth*/;
3557         switch (dstImageInfo->type)
3558         {
3559             case CL_MEM_OBJECT_IMAGE1D:
3560                 dst_lod = destPos[1];
3561                 destPos_lod[1] = destPos_lod[2] = 0;
3562                 dst_width_lod = (dstImageInfo->width >> dst_lod)
3563                     ? (dstImageInfo->width >> dst_lod)
3564                     : 1;
3565                 break;
3566             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3567             case CL_MEM_OBJECT_IMAGE2D:
3568                 dst_lod = destPos[2];
3569                 destPos_lod[1] = destPos[1];
3570                 destPos_lod[2] = 0;
3571                 dst_width_lod = (dstImageInfo->width >> dst_lod)
3572                     ? (dstImageInfo->width >> dst_lod)
3573                     : 1;
3574                 if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE2D)
3575                     dst_height_lod = (dstImageInfo->height >> dst_lod)
3576                         ? (dstImageInfo->height >> dst_lod)
3577                         : 1;
3578                 break;
3579             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3580             case CL_MEM_OBJECT_IMAGE3D:
3581                 dst_lod = destPos[3];
3582                 destPos_lod[1] = destPos[1];
3583                 destPos_lod[2] = destPos[2];
3584                 dst_width_lod = (dstImageInfo->width >> dst_lod)
3585                     ? (dstImageInfo->width >> dst_lod)
3586                     : 1;
3587                 dst_height_lod = (dstImageInfo->height >> dst_lod)
3588                     ? (dstImageInfo->height >> dst_lod)
3589                     : 1;
3590                 if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE3D)
3591                     dst_depth_lod = (dstImageInfo->depth >> dst_lod)
3592                         ? (dstImageInfo->depth >> dst_lod)
3593                         : 1;
3594                 break;
3595         }
3596         dst_mip_level_offset = compute_mip_level_offset(dstImageInfo, dst_lod);
3597         dst_row_pitch_lod =
3598             dst_width_lod * get_pixel_size(dstImageInfo->format);
3599         dst_slice_pitch_lod = dst_row_pitch_lod * dst_height_lod;
3600     }
3601 
3602     // Get initial pointers
3603     char *sourcePtr = (char *)imageValues
3604         + sourcePos_lod[2] * src_slice_pitch_lod
3605         + sourcePos_lod[1] * src_row_pitch_lod + pixelSize * sourcePos_lod[0]
3606         + src_mip_level_offset;
3607     char *destPtr = (char *)destImageValues
3608         + destPos_lod[2] * dst_slice_pitch_lod
3609         + destPos_lod[1] * dst_row_pitch_lod + pixelSize * destPos_lod[0]
3610         + dst_mip_level_offset;
3611 
3612     for (size_t z = 0; z < (regionSize[2] > 0 ? regionSize[2] : 1); z++)
3613     {
3614         char *rowSourcePtr = sourcePtr;
3615         char *rowDestPtr = destPtr;
3616         for (size_t y = 0; y < regionSize[1]; y++)
3617         {
3618             memcpy(rowDestPtr, rowSourcePtr, pixelSize * regionSize[0]);
3619             rowSourcePtr += src_row_pitch_lod;
3620             rowDestPtr += dst_row_pitch_lod;
3621         }
3622 
3623         sourcePtr += src_slice_pitch_lod;
3624         destPtr += dst_slice_pitch_lod;
3625     }
3626 }
3627 
random_float(float low,float high,MTdata d)3628 float random_float(float low, float high, MTdata d)
3629 {
3630     float t = (float)genrand_real1(d);
3631     return (1.0f - t) * low + t * high;
3632 }
3633 
CoordWalker(void * coords,bool useFloats,size_t vecSize)3634 CoordWalker::CoordWalker(void *coords, bool useFloats, size_t vecSize)
3635 {
3636     if (useFloats)
3637     {
3638         mFloatCoords = (cl_float *)coords;
3639         mIntCoords = NULL;
3640     }
3641     else
3642     {
3643         mFloatCoords = NULL;
3644         mIntCoords = (cl_int *)coords;
3645     }
3646     mVecSize = vecSize;
3647 }
3648 
~CoordWalker()3649 CoordWalker::~CoordWalker() {}
3650 
Get(size_t idx,size_t el)3651 cl_float CoordWalker::Get(size_t idx, size_t el)
3652 {
3653     if (mIntCoords != NULL)
3654         return (cl_float)mIntCoords[idx * mVecSize + el];
3655     else
3656         return mFloatCoords[idx * mVecSize + el];
3657 }
3658 
3659 
print_read_header(const cl_image_format * format,image_sampler_data * sampler,bool err,int t)3660 void print_read_header(const cl_image_format *format,
3661                        image_sampler_data *sampler, bool err, int t)
3662 {
3663     const char *addressMode = NULL;
3664     const char *normalizedNames[2] = { "UNNORMALIZED", "NORMALIZED" };
3665 
3666     if (sampler->addressing_mode == CL_ADDRESS_CLAMP)
3667         addressMode = "CL_ADDRESS_CLAMP";
3668     else if (sampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE)
3669         addressMode = "CL_ADDRESS_CLAMP_TO_EDGE";
3670     else if (sampler->addressing_mode == CL_ADDRESS_REPEAT)
3671         addressMode = "CL_ADDRESS_REPEAT";
3672     else if (sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT)
3673         addressMode = "CL_ADDRESS_MIRRORED_REPEAT";
3674     else
3675         addressMode = "CL_ADDRESS_NONE";
3676 
3677     if (t)
3678     {
3679         if (err)
3680             log_error("[%-7s %-24s %d] - %s - %s - %s - %s\n",
3681                       GetChannelOrderName(format->image_channel_order),
3682                       GetChannelTypeName(format->image_channel_data_type),
3683                       (int)get_format_channel_count(format),
3684                       sampler->filter_mode == CL_FILTER_NEAREST
3685                           ? "CL_FILTER_NEAREST"
3686                           : "CL_FILTER_LINEAR",
3687                       addressMode,
3688                       normalizedNames[sampler->normalized_coords ? 1 : 0],
3689                       t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED");
3690         else
3691             log_info("[%-7s %-24s %d] - %s - %s - %s - %s\n",
3692                      GetChannelOrderName(format->image_channel_order),
3693                      GetChannelTypeName(format->image_channel_data_type),
3694                      (int)get_format_channel_count(format),
3695                      sampler->filter_mode == CL_FILTER_NEAREST
3696                          ? "CL_FILTER_NEAREST"
3697                          : "CL_FILTER_LINEAR",
3698                      addressMode,
3699                      normalizedNames[sampler->normalized_coords ? 1 : 0],
3700                      t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED");
3701     }
3702     else
3703     {
3704         if (err)
3705             log_error("[%-7s %-24s %d] - %s - %s - %s\n",
3706                       GetChannelOrderName(format->image_channel_order),
3707                       GetChannelTypeName(format->image_channel_data_type),
3708                       (int)get_format_channel_count(format),
3709                       sampler->filter_mode == CL_FILTER_NEAREST
3710                           ? "CL_FILTER_NEAREST"
3711                           : "CL_FILTER_LINEAR",
3712                       addressMode,
3713                       normalizedNames[sampler->normalized_coords ? 1 : 0]);
3714         else
3715             log_info("[%-7s %-24s %d] - %s - %s - %s\n",
3716                      GetChannelOrderName(format->image_channel_order),
3717                      GetChannelTypeName(format->image_channel_data_type),
3718                      (int)get_format_channel_count(format),
3719                      sampler->filter_mode == CL_FILTER_NEAREST
3720                          ? "CL_FILTER_NEAREST"
3721                          : "CL_FILTER_LINEAR",
3722                      addressMode,
3723                      normalizedNames[sampler->normalized_coords ? 1 : 0]);
3724     }
3725 }
3726 
print_write_header(const cl_image_format * format,bool err=false)3727 void print_write_header(const cl_image_format *format, bool err = false)
3728 {
3729     if (err)
3730         log_error("[%-7s %-24s %d]\n",
3731                   GetChannelOrderName(format->image_channel_order),
3732                   GetChannelTypeName(format->image_channel_data_type),
3733                   (int)get_format_channel_count(format));
3734     else
3735         log_info("[%-7s %-24s %d]\n",
3736                  GetChannelOrderName(format->image_channel_order),
3737                  GetChannelTypeName(format->image_channel_data_type),
3738                  (int)get_format_channel_count(format));
3739 }
3740 
3741 
print_header(const cl_image_format * format,bool err=false)3742 void print_header(const cl_image_format *format, bool err = false)
3743 {
3744     if (err)
3745     {
3746         log_error("[%-7s %-24s %d]\n",
3747                   GetChannelOrderName(format->image_channel_order),
3748                   GetChannelTypeName(format->image_channel_data_type),
3749                   (int)get_format_channel_count(format));
3750     }
3751     else
3752     {
3753         log_info("[%-7s %-24s %d]\n",
3754                  GetChannelOrderName(format->image_channel_order),
3755                  GetChannelTypeName(format->image_channel_data_type),
3756                  (int)get_format_channel_count(format));
3757     }
3758 }
3759 
find_format(cl_image_format * formatList,unsigned int numFormats,cl_image_format * formatToFind)3760 bool find_format(cl_image_format *formatList, unsigned int numFormats,
3761                  cl_image_format *formatToFind)
3762 {
3763     for (unsigned int i = 0; i < numFormats; i++)
3764     {
3765         if (formatList[i].image_channel_order
3766                 == formatToFind->image_channel_order
3767             && formatList[i].image_channel_data_type
3768                 == formatToFind->image_channel_data_type)
3769             return true;
3770     }
3771     return false;
3772 }
3773 
build_required_image_formats(cl_mem_flags flags,cl_mem_object_type image_type,cl_device_id device,std::vector<cl_image_format> & formatsToSupport)3774 void build_required_image_formats(
3775     cl_mem_flags flags, cl_mem_object_type image_type, cl_device_id device,
3776     std::vector<cl_image_format> &formatsToSupport)
3777 {
3778     formatsToSupport.clear();
3779 
3780     // Minimum list of supported image formats for reading or writing (embedded
3781     // profile)
3782     static std::vector<cl_image_format> embeddedProfile_readOrWrite{
3783         // clang-format off
3784         { CL_RGBA, CL_UNORM_INT8 },
3785         { CL_RGBA, CL_UNORM_INT16 },
3786         { CL_RGBA, CL_SIGNED_INT8 },
3787         { CL_RGBA, CL_SIGNED_INT16 },
3788         { CL_RGBA, CL_SIGNED_INT32 },
3789         { CL_RGBA, CL_UNSIGNED_INT8 },
3790         { CL_RGBA, CL_UNSIGNED_INT16 },
3791         { CL_RGBA, CL_UNSIGNED_INT32 },
3792         { CL_RGBA, CL_HALF_FLOAT },
3793         { CL_RGBA, CL_FLOAT },
3794         // clang-format on
3795     };
3796 
3797     // Minimum list of required image formats for reading or writing
3798     // num_channels, for all image types.
3799     static std::vector<cl_image_format> fullProfile_readOrWrite{
3800         // clang-format off
3801         { CL_RGBA, CL_UNORM_INT8 },
3802         { CL_RGBA, CL_UNORM_INT16 },
3803         { CL_RGBA, CL_SIGNED_INT8 },
3804         { CL_RGBA, CL_SIGNED_INT16 },
3805         { CL_RGBA, CL_SIGNED_INT32 },
3806         { CL_RGBA, CL_UNSIGNED_INT8 },
3807         { CL_RGBA, CL_UNSIGNED_INT16 },
3808         { CL_RGBA, CL_UNSIGNED_INT32 },
3809         { CL_RGBA, CL_HALF_FLOAT },
3810         { CL_RGBA, CL_FLOAT },
3811         { CL_BGRA, CL_UNORM_INT8 },
3812         // clang-format on
3813     };
3814 
3815     // Minimum list of supported image formats for reading or writing
3816     // (OpenCL 2.0, 2.1, or 2.2), for all image types.
3817     static std::vector<cl_image_format> fullProfile_2x_readOrWrite{
3818         // clang-format off
3819         { CL_R, CL_UNORM_INT8 },
3820         { CL_R, CL_UNORM_INT16 },
3821         { CL_R, CL_SNORM_INT8 },
3822         { CL_R, CL_SNORM_INT16 },
3823         { CL_R, CL_SIGNED_INT8 },
3824         { CL_R, CL_SIGNED_INT16 },
3825         { CL_R, CL_SIGNED_INT32 },
3826         { CL_R, CL_UNSIGNED_INT8 },
3827         { CL_R, CL_UNSIGNED_INT16 },
3828         { CL_R, CL_UNSIGNED_INT32 },
3829         { CL_R, CL_HALF_FLOAT },
3830         { CL_R, CL_FLOAT },
3831         { CL_RG, CL_UNORM_INT8 },
3832         { CL_RG, CL_UNORM_INT16 },
3833         { CL_RG, CL_SNORM_INT8 },
3834         { CL_RG, CL_SNORM_INT16 },
3835         { CL_RG, CL_SIGNED_INT8 },
3836         { CL_RG, CL_SIGNED_INT16 },
3837         { CL_RG, CL_SIGNED_INT32 },
3838         { CL_RG, CL_UNSIGNED_INT8 },
3839         { CL_RG, CL_UNSIGNED_INT16 },
3840         { CL_RG, CL_UNSIGNED_INT32 },
3841         { CL_RG, CL_HALF_FLOAT },
3842         { CL_RG, CL_FLOAT },
3843         { CL_RGBA, CL_UNORM_INT8 },
3844         { CL_RGBA, CL_UNORM_INT16 },
3845         { CL_RGBA, CL_SNORM_INT8 },
3846         { CL_RGBA, CL_SNORM_INT16 },
3847         { CL_RGBA, CL_SIGNED_INT8 },
3848         { CL_RGBA, CL_SIGNED_INT16 },
3849         { CL_RGBA, CL_SIGNED_INT32 },
3850         { CL_RGBA, CL_UNSIGNED_INT8 },
3851         { CL_RGBA, CL_UNSIGNED_INT16 },
3852         { CL_RGBA, CL_UNSIGNED_INT32 },
3853         { CL_RGBA, CL_HALF_FLOAT },
3854         { CL_RGBA, CL_FLOAT },
3855         { CL_BGRA, CL_UNORM_INT8 },
3856         // clang-format on
3857     };
3858 
3859     // Conditional addition to the 2x readOrWrite table:
3860     // Support for the CL_DEPTH image channel order is required only for 2D
3861     // images and 2D image arrays.
3862     static std::vector<cl_image_format> fullProfile_2x_readOrWrite_Depth{
3863         // clang-format off
3864         { CL_DEPTH, CL_UNORM_INT16 },
3865         { CL_DEPTH, CL_FLOAT },
3866         // clang-format on
3867     };
3868 
3869     // Conditional addition to the 2x readOrWrite table:
3870     // Support for reading from the CL_sRGBA image channel order is optional for
3871     // 1D image buffers. Support for writing to the CL_sRGBA image channel order
3872     // is optional for all image types.
3873     static std::vector<cl_image_format> fullProfile_2x_readOrWrite_srgb{
3874         { CL_sRGBA, CL_UNORM_INT8 },
3875     };
3876 
3877     // Minimum list of required image formats for reading and writing.
3878     static std::vector<cl_image_format> fullProfile_readAndWrite{
3879         // clang-format off
3880         { CL_R, CL_UNORM_INT8 },
3881         { CL_R, CL_SIGNED_INT8 },
3882         { CL_R, CL_SIGNED_INT16 },
3883         { CL_R, CL_SIGNED_INT32 },
3884         { CL_R, CL_UNSIGNED_INT8 },
3885         { CL_R, CL_UNSIGNED_INT16 },
3886         { CL_R, CL_UNSIGNED_INT32 },
3887         { CL_R, CL_HALF_FLOAT },
3888         { CL_R, CL_FLOAT },
3889         { CL_RGBA, CL_UNORM_INT8 },
3890         { CL_RGBA, CL_SIGNED_INT8 },
3891         { CL_RGBA, CL_SIGNED_INT16 },
3892         { CL_RGBA, CL_SIGNED_INT32 },
3893         { CL_RGBA, CL_UNSIGNED_INT8 },
3894         { CL_RGBA, CL_UNSIGNED_INT16 },
3895         { CL_RGBA, CL_UNSIGNED_INT32 },
3896         { CL_RGBA, CL_HALF_FLOAT },
3897         { CL_RGBA, CL_FLOAT },
3898         // clang-format on
3899     };
3900 
3901     // Embedded profile
3902     if (gIsEmbedded)
3903     {
3904         copy(embeddedProfile_readOrWrite.begin(),
3905              embeddedProfile_readOrWrite.end(),
3906              back_inserter(formatsToSupport));
3907     }
3908     // Full profile
3909     else
3910     {
3911         Version version = get_device_cl_version(device);
3912         if (version < Version(2, 0) || version >= Version(3, 0))
3913         {
3914             // Full profile, OpenCL 1.2 or 3.0.
3915             if (flags & CL_MEM_KERNEL_READ_AND_WRITE)
3916             {
3917                 // Note: assumes that read-write images are supported!
3918                 copy(fullProfile_readAndWrite.begin(),
3919                      fullProfile_readAndWrite.end(),
3920                      back_inserter(formatsToSupport));
3921             }
3922             else
3923             {
3924                 copy(fullProfile_readOrWrite.begin(),
3925                      fullProfile_readOrWrite.end(),
3926                      back_inserter(formatsToSupport));
3927             }
3928         }
3929         else
3930         {
3931             // Full profile, OpenCL 2.0, 2.1, 2.2.
3932             if (flags & CL_MEM_KERNEL_READ_AND_WRITE)
3933             {
3934                 copy(fullProfile_readAndWrite.begin(),
3935                      fullProfile_readAndWrite.end(),
3936                      back_inserter(formatsToSupport));
3937             }
3938             else
3939             {
3940                 copy(fullProfile_2x_readOrWrite.begin(),
3941                      fullProfile_2x_readOrWrite.end(),
3942                      back_inserter(formatsToSupport));
3943 
3944                 // Support for the CL_DEPTH image channel order is required only
3945                 // for 2D images and 2D image arrays.
3946                 if (image_type == CL_MEM_OBJECT_IMAGE2D
3947                     || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
3948                 {
3949                     copy(fullProfile_2x_readOrWrite_Depth.begin(),
3950                          fullProfile_2x_readOrWrite_Depth.end(),
3951                          back_inserter(formatsToSupport));
3952                 }
3953 
3954                 // Support for reading from the CL_sRGBA image channel order is
3955                 // optional for 1D image buffers. Support for writing to the
3956                 // CL_sRGBA image channel order is optional for all image types.
3957                 if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER
3958                     && flags == CL_MEM_READ_ONLY)
3959                 {
3960                     copy(fullProfile_2x_readOrWrite_srgb.begin(),
3961                          fullProfile_2x_readOrWrite_srgb.end(),
3962                          back_inserter(formatsToSupport));
3963                 }
3964             }
3965         }
3966     }
3967 }
3968 
is_image_format_required(cl_image_format format,cl_mem_flags flags,cl_mem_object_type image_type,cl_device_id device)3969 bool is_image_format_required(cl_image_format format, cl_mem_flags flags,
3970                               cl_mem_object_type image_type,
3971                               cl_device_id device)
3972 {
3973     std::vector<cl_image_format> formatsToSupport;
3974     build_required_image_formats(flags, image_type, device, formatsToSupport);
3975 
3976     for (auto &formatItr : formatsToSupport)
3977     {
3978         if (formatItr.image_channel_order == format.image_channel_order
3979             && formatItr.image_channel_data_type
3980                 == format.image_channel_data_type)
3981         {
3982             return true;
3983         }
3984     }
3985 
3986     return false;
3987 }
3988 
compute_max_mip_levels(size_t width,size_t height,size_t depth)3989 cl_uint compute_max_mip_levels(size_t width, size_t height, size_t depth)
3990 {
3991     cl_uint retMaxMipLevels = 0, max_dim = 0;
3992 
3993     max_dim = width;
3994     max_dim = height > max_dim ? height : max_dim;
3995     max_dim = depth > max_dim ? depth : max_dim;
3996 
3997     while (max_dim)
3998     {
3999         retMaxMipLevels++;
4000         max_dim >>= 1;
4001     }
4002     return retMaxMipLevels;
4003 }
4004 
compute_mipmapped_image_size(image_descriptor imageInfo)4005 cl_ulong compute_mipmapped_image_size(image_descriptor imageInfo)
4006 {
4007     cl_ulong retSize = 0;
4008     size_t curr_width, curr_height, curr_depth, curr_array_size;
4009     curr_width = imageInfo.width;
4010     curr_height = imageInfo.height;
4011     curr_depth = imageInfo.depth;
4012     curr_array_size = imageInfo.arraySize;
4013 
4014     for (int i = 0; i < (int)imageInfo.num_mip_levels; i++)
4015     {
4016         switch (imageInfo.type)
4017         {
4018             case CL_MEM_OBJECT_IMAGE3D:
4019                 retSize += (cl_ulong)curr_width * curr_height * curr_depth
4020                     * get_pixel_size(imageInfo.format);
4021                 break;
4022             case CL_MEM_OBJECT_IMAGE2D:
4023                 retSize += (cl_ulong)curr_width * curr_height
4024                     * get_pixel_size(imageInfo.format);
4025                 break;
4026             case CL_MEM_OBJECT_IMAGE1D:
4027                 retSize +=
4028                     (cl_ulong)curr_width * get_pixel_size(imageInfo.format);
4029                 break;
4030             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
4031                 retSize += (cl_ulong)curr_width * curr_array_size
4032                     * get_pixel_size(imageInfo.format);
4033                 break;
4034             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
4035                 retSize += (cl_ulong)curr_width * curr_height * curr_array_size
4036                     * get_pixel_size(imageInfo.format);
4037                 break;
4038         }
4039 
4040         switch (imageInfo.type)
4041         {
4042             case CL_MEM_OBJECT_IMAGE3D:
4043                 curr_depth = curr_depth >> 1 ? curr_depth >> 1 : 1;
4044             case CL_MEM_OBJECT_IMAGE2D:
4045             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
4046                 curr_height = curr_height >> 1 ? curr_height >> 1 : 1;
4047             case CL_MEM_OBJECT_IMAGE1D:
4048             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
4049                 curr_width = curr_width >> 1 ? curr_width >> 1 : 1;
4050         }
4051     }
4052 
4053     return retSize;
4054 }
4055 
compute_mip_level_offset(image_descriptor * imageInfo,size_t lod)4056 size_t compute_mip_level_offset(image_descriptor *imageInfo, size_t lod)
4057 {
4058     size_t retOffset = 0;
4059     size_t width, height, depth;
4060     width = imageInfo->width;
4061     height = imageInfo->height;
4062     depth = imageInfo->depth;
4063 
4064     for (size_t i = 0; i < lod; i++)
4065     {
4066         switch (imageInfo->type)
4067         {
4068             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
4069                 retOffset += (size_t)width * height * imageInfo->arraySize
4070                     * get_pixel_size(imageInfo->format);
4071                 break;
4072             case CL_MEM_OBJECT_IMAGE3D:
4073                 retOffset += (size_t)width * height * depth
4074                     * get_pixel_size(imageInfo->format);
4075                 break;
4076             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
4077                 retOffset += (size_t)width * imageInfo->arraySize
4078                     * get_pixel_size(imageInfo->format);
4079                 break;
4080             case CL_MEM_OBJECT_IMAGE2D:
4081                 retOffset +=
4082                     (size_t)width * height * get_pixel_size(imageInfo->format);
4083                 break;
4084             case CL_MEM_OBJECT_IMAGE1D:
4085                 retOffset += (size_t)width * get_pixel_size(imageInfo->format);
4086                 break;
4087         }
4088 
4089         // Compute next lod dimensions
4090         switch (imageInfo->type)
4091         {
4092             case CL_MEM_OBJECT_IMAGE3D: depth = (depth >> 1) ? (depth >> 1) : 1;
4093             case CL_MEM_OBJECT_IMAGE2D:
4094             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
4095                 height = (height >> 1) ? (height >> 1) : 1;
4096             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
4097             case CL_MEM_OBJECT_IMAGE1D: width = (width >> 1) ? (width >> 1) : 1;
4098         }
4099     }
4100     return retOffset;
4101 }
4102 
convert_image_type_to_string(cl_mem_object_type image_type)4103 const char *convert_image_type_to_string(cl_mem_object_type image_type)
4104 {
4105     switch (image_type)
4106     {
4107         case CL_MEM_OBJECT_IMAGE1D: return "1D";
4108         case CL_MEM_OBJECT_IMAGE2D: return "2D";
4109         case CL_MEM_OBJECT_IMAGE3D: return "3D";
4110         case CL_MEM_OBJECT_IMAGE1D_ARRAY: return "1D array";
4111         case CL_MEM_OBJECT_IMAGE2D_ARRAY: return "2D array";
4112         case CL_MEM_OBJECT_IMAGE1D_BUFFER: return "1D image buffer";
4113         default: return "unrecognized object type";
4114     }
4115 }
4116