#include <clc/clc.h>

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[2*offset])) = vec; \
  } \
\
  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[3*offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \
    mem[3 * offset + 2] = vec.s2; \
  } \
\
  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&mem[4*offset])) = vec; \
  } \
\
  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&mem[8*offset])) = vec; \
  } \
\
  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&mem[16*offset])) = vec; \
  }

#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)

VSTORE_ADDR_SPACES(char)
VSTORE_ADDR_SPACES(uchar)
VSTORE_ADDR_SPACES(short)
VSTORE_ADDR_SPACES(ushort)
VSTORE_ADDR_SPACES(int)
VSTORE_ADDR_SPACES(uint)
VSTORE_ADDR_SPACES(long)
VSTORE_ADDR_SPACES(ulong)
VSTORE_ADDR_SPACES(float)

#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
  VSTORE_ADDR_SPACES(double)
#endif

#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
  VSTORE_ADDR_SPACES(half)
#endif
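/*
 * Illustrative expansion (for exposition, not emitted code): for
 * VSTORE_VECTORIZE(int, __global) the vstore2 overload above becomes
 * roughly:
 *
 *   typedef int2 less_aligned___globalint2
 *       __attribute__ ((aligned (sizeof(int))));
 *   _CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset,
 *                                       __global int *mem) {
 *     *((__global less_aligned___globalint2 *) (&mem[2 * offset])) = vec;
 *   }
 *
 * The less_aligned_* typedef lowers the vector type's alignment to that of
 * its scalar element, because vstoreN only guarantees that mem is aligned
 * to sizeof(PRIM_TYPE). There is no 3-element typedef: sizeof(int3) equals
 * sizeof(int4), so a direct int3 store could write 16 bytes; vstore3
 * therefore stores the first two lanes as a vector and the third as a
 * scalar.
 */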
/* The vstore_half functions are legal even without cl_khr_fp16 */
#if __clang_major__ < 6
#define DECLARE_HELPER(STYPE, AS, builtin) void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
#else
#define DECLARE_HELPER(STYPE, AS, __builtin) \
_CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) \
{ \
  __builtin(s, d); \
}
#endif

DECLARE_HELPER(float, __private, __builtin_store_halff);
DECLARE_HELPER(float, __global, __builtin_store_halff);
DECLARE_HELPER(float, __local, __builtin_store_halff);

#ifdef cl_khr_fp64
DECLARE_HELPER(double, __private, __builtin_store_half);
DECLARE_HELPER(double, __global, __builtin_store_half);
DECLARE_HELPER(double, __local, __builtin_store_half);
#endif

#define VEC_STORE1(STYPE, AS, val, ROUNDF) __clc_vstore_half_##STYPE##_helper##AS (ROUNDF(val), &mem[offset++]);

#define VEC_STORE2(STYPE, AS, val, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.lo, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.hi, ROUNDF)
#define VEC_STORE3(STYPE, AS, val, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.s0, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.s1, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.s2, ROUNDF)
#define VEC_STORE4(STYPE, AS, val, ROUNDF) \
  VEC_STORE2(STYPE, AS, val.lo, ROUNDF) \
  VEC_STORE2(STYPE, AS, val.hi, ROUNDF)
#define VEC_STORE8(STYPE, AS, val, ROUNDF) \
  VEC_STORE4(STYPE, AS, val.lo, ROUNDF) \
  VEC_STORE4(STYPE, AS, val.hi, ROUNDF)
#define VEC_STORE16(STYPE, AS, val, ROUNDF) \
  VEC_STORE8(STYPE, AS, val.lo, ROUNDF) \
  VEC_STORE8(STYPE, AS, val.hi, ROUNDF)

#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, ROUNDF) \
  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
    offset *= VEC_SIZE; \
    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
  } \
  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
    offset *= OFFSET; \
    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
  }

_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x)
{
  return x;
}
_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x)
{
  /* Remove the lower 13 bits to make sure the number is rounded down */
  int mask = 0xffffe000;
  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
  /* Denormals cannot be flushed, and they use a different bit for rounding */
  if (exp < -14)
    mask <<= min(-(exp + 14), 10);
  /* RTZ does not produce Inf for large numbers */
  if (fabs(x) > 65504.0f && !isinf(x))
    return copysign(65504.0f, x);
  /* Handle the NaN corner case */
  if (isnan(x))
    return x;
  return as_float(as_uint(x) & mask);
}
_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x)
{
  const float inf = copysign(INFINITY, x);
  /* Set the lower 13 bits */
  int mask = (1 << 13) - 1;
  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
  /* Denormals cannot be flushed, and they use a different bit for rounding */
  if (exp < -14)
    mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
  /* Handle the NaN corner case */
  if (isnan(x))
    return x;
  const float next = nextafter(as_float(as_uint(x) | mask), inf);
  return ((as_uint(x) & mask) == 0) ? x : next;
}
_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x)
{
  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
}
_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x)
{
  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
}
_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x)
{
  /* Mantissa + implicit bit */
  const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
  int shift = 13;
  if (exp < -14) {
    /* The default assumes the lower 13 bits are rounded,
     * but it can be more for denormals.
     * Shifting beyond the point where last == 0b and the
     * guard/round bits == 00b is not necessary. */
    shift += min(-(exp + 14), 15);
  }
  int mask = (1 << shift) - 1;
  const uint grs = mantissa & mask;
  const uint last = mantissa & (1 << shift);
  /* The IEEE round-up rule is: grs > 100b, or grs == 100b and last == 1.
   * exp > 15 should round to inf. */
  bool roundup = (grs > (1 << (shift - 1))) ||
      (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
  return roundup ? __clc_rti(x) : __clc_rtz(x);
}
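/*
 * How the helpers above work (for exposition): each __clc_* function rounds
 * a float to a value that is exactly representable as a half, so that the
 * conversion later performed by __builtin_store_half* does not round again.
 * For a normal half result the low 13 bits of the float mantissa are the
 * guard/round/sticky (grs) bits and bit 13 is the last kept bit, e.g. for
 * round-to-nearest-even:
 *
 *   __clc_rte(as_float(0x3f801000))  // 1 + 2^-11: grs == 0x1000 (a tie),
 *                                    // last == 0 -> rounds down to 1.0f
 *   __clc_rte(as_float(0x3f803000))  // 1 + 3*2^-11: grs == 0x1000 (a tie),
 *                                    // last == 1 -> rounds up to 1 + 2^-9
 */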
#ifdef cl_khr_fp64
_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x)
{
  return x;
}
_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x)
{
  /* Remove the lower 42 bits to make sure the number is rounded down */
  ulong mask = 0xfffffc0000000000UL;
  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
  /* Denormals cannot be flushed, and they use a different bit for rounding */
  if (exp < -14)
    mask <<= min(-(exp + 14), 10);
  /* RTZ does not produce Inf for large numbers */
  if (fabs(x) > 65504.0 && !isinf(x))
    return copysign(65504.0, x);
  /* Handle the NaN corner case */
  if (isnan(x))
    return x;
  return as_double(as_ulong(x) & mask);
}
_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x)
{
  const double inf = copysign((double)INFINITY, x);
  /* Set the lower 42 bits */
  long mask = (1UL << 42UL) - 1UL;
  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
  /* Denormals cannot be flushed, and they use a different bit for rounding */
  if (exp < -14)
    mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
  /* Handle the NaN corner case */
  if (isnan(x))
    return x;
  const double next = nextafter(as_double(as_ulong(x) | mask), inf);
  return ((as_ulong(x) & mask) == 0) ? x : next;
}
_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x)
{
  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x) : __clc_rti(x);
}
_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x)
{
  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x) : __clc_rtz(x);
}
_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x)
{
  /* Mantissa + implicit bit */
  const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
  int shift = 42;
  if (exp < -14) {
    /* The default assumes the lower 42 bits are rounded,
     * but it can be more for denormals.
     * Shifting beyond the point where last == 0b and the
     * guard/round bits == 00b is not necessary. */
    shift += min(-(exp + 14), 15);
  }
  ulong mask = (1UL << shift) - 1UL;
  const ulong grs = mantissa & mask;
  const ulong last = mantissa & (1UL << shift);
  /* The IEEE round-up rule is: grs > 100b, or grs == 100b and last == 1.
   * exp > 15 should round to inf. */
  bool roundup = (grs > (1UL << (shift - 1UL))) ||
      (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
  return roundup ? __clc_rti(x) : __clc_rtz(x);
}
#endif

#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop) \
  __FUNC(SUFFIX ## _rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz) \
  __FUNC(SUFFIX ## _rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn) \
  __FUNC(SUFFIX ## _rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp) \
  __FUNC(SUFFIX ## _rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte)

#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)

#define __CLC_BODY "vstore_half.inc"
#include <clc/math/gentype.inc>
#undef __CLC_BODY
#undef FUNC
#undef __XFUNC
#undef __FUNC
#undef VEC_STORE16
#undef VEC_STORE8
#undef VEC_STORE4
#undef VEC_STORE3
#undef VEC_STORE2
#undef VEC_STORE1
#undef DECLARE_HELPER
#undef VSTORE_ADDR_SPACES
#undef VSTORE_VECTORIZE
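/*
 * Usage sketch (for exposition, not part of the library): the overloads
 * generated above let a kernel down-convert with an explicit rounding mode:
 *
 *   __kernel void pack(__global const float4 *in, __global half *out) {
 *     size_t i = get_global_id(0);
 *     vstore_half4_rte(in[i], i, out);  // writes out[4*i] .. out[4*i + 3]
 *   }
 *
 * vstore_halfN scales offset by N, while vstorea_halfN scales it by the
 * aligned slot size (the OFFSET macro parameter); per the OpenCL spec the
 * two differ only for N == 3, where vstorea_half3 uses a stride of 4.
 */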