#include <clc/clc.h>

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

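/* VSTORE_VECTORIZE defines the vstore2/3/4/8/16 overloads for one scalar type
 * in one address space.  The vstoren pointer argument is only required to be
 * aligned to the scalar element, so each vector store goes through a
 * "less_aligned" typedef whose alignment is reduced to sizeof(PRIM_TYPE).
 * vstore3 is split into a 2-element store plus a scalar store of .s2. */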
#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[2*offset])) = vec; \
  } \
\
  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[3*offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \
    mem[3 * offset + 2] = vec.s2;\
  } \
\
  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&mem[4*offset])) = vec; \
  } \
\
  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&mem[8*offset])) = vec; \
  } \
\
  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&mem[16*offset])) = vec; \
  }

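/* VSTORE_ADDR_SPACES instantiates the overloads above for the writable
 * address spaces (__private, __local, __global); the calls below cover
 * every scalar built-in type. */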
#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
    VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
    VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
    VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)

VSTORE_ADDR_SPACES(char)
VSTORE_ADDR_SPACES(uchar)
VSTORE_ADDR_SPACES(short)
VSTORE_ADDR_SPACES(ushort)
VSTORE_ADDR_SPACES(int)
VSTORE_ADDR_SPACES(uint)
VSTORE_ADDR_SPACES(long)
VSTORE_ADDR_SPACES(ulong)
VSTORE_ADDR_SPACES(float)


#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
    VSTORE_ADDR_SPACES(double)
#endif

#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
    VSTORE_ADDR_SPACES(half)
#endif

/* The vstore_half functions are legal even without cl_khr_fp16 */
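/* __clc_vstore_half_<type>_helper<AS> converts one scalar to half and stores
 * it.  With clang >= 6 it simply wraps the __builtin_store_half(f) builtin;
 * for older clang only a declaration is emitted here and the definition is
 * expected to be provided elsewhere in the library. */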
#if __clang_major__ < 6
#define DECLARE_HELPER(STYPE, AS, builtin) void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
#else
#define DECLARE_HELPER(STYPE, AS, __builtin) \
_CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) \
{ \
	__builtin(s, d); \
}
#endif

DECLARE_HELPER(float, __private, __builtin_store_halff);
DECLARE_HELPER(float, __global, __builtin_store_halff);
DECLARE_HELPER(float, __local, __builtin_store_halff);

#ifdef cl_khr_fp64
DECLARE_HELPER(double, __private, __builtin_store_half);
DECLARE_HELPER(double, __global, __builtin_store_half);
DECLARE_HELPER(double, __local, __builtin_store_half);
#endif

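/* VEC_STOREn unrolls an n-element store: each lane is passed through the
 * rounding function ROUNDF, converted to half by the helper above, and
 * written to mem[offset], with offset advancing one element per lane. */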
#define VEC_STORE1(STYPE, AS, val, ROUNDF) __clc_vstore_half_##STYPE##_helper##AS (ROUNDF(val), &mem[offset++]);

#define VEC_STORE2(STYPE, AS, val, ROUNDF) \
	VEC_STORE1(STYPE, AS, val.lo, ROUNDF) \
	VEC_STORE1(STYPE, AS, val.hi, ROUNDF)
#define VEC_STORE3(STYPE, AS, val, ROUNDF) \
	VEC_STORE1(STYPE, AS, val.s0, ROUNDF) \
	VEC_STORE1(STYPE, AS, val.s1, ROUNDF) \
	VEC_STORE1(STYPE, AS, val.s2, ROUNDF)
#define VEC_STORE4(STYPE, AS, val, ROUNDF) \
	VEC_STORE2(STYPE, AS, val.lo, ROUNDF) \
	VEC_STORE2(STYPE, AS, val.hi, ROUNDF)
#define VEC_STORE8(STYPE, AS, val, ROUNDF) \
	VEC_STORE4(STYPE, AS, val.lo, ROUNDF) \
	VEC_STORE4(STYPE, AS, val.hi, ROUNDF)
#define VEC_STORE16(STYPE, AS, val, ROUNDF) \
	VEC_STORE8(STYPE, AS, val.lo, ROUNDF) \
	VEC_STORE8(STYPE, AS, val.hi, ROUNDF)

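/* __FUNC generates one vstore_half and one vstorea_half overload.  They
 * differ only in how the element offset is scaled: vstore_half uses the
 * vector size, while vstorea_half uses the aligned stride OFFSET (which is 4
 * rather than 3 for the 3-element variants, per the OpenCL spec). */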
#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, ROUNDF) \
  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
    offset *= VEC_SIZE; \
    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
  } \
  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
    offset *= OFFSET; \
    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
  }

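/* The __clc_* helpers below pre-round a float/double to a value that is
 * exactly representable in half precision, so the conversion performed by
 * the store helper cannot round again.  __clc_noop is used for the
 * unsuffixed vstore_half, leaving the rounding to the builtin conversion
 * (round to nearest even by default). */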
_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x)
{
	return x;
}
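/* Round toward zero: clear the mantissa bits that half cannot hold (13 for
 * normal values, more once the result becomes a half denormal) and clamp
 * anything above the largest finite half (65504) back to it. */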
_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x)
{
	/* Remove the lower 13 bits to make sure the number is rounded down */
	int mask = 0xffffe000;
	const int exp = (as_uint(x) >> 23 & 0xff) - 127;
	/* Denormals cannot be flushed, and they use a different bit for rounding */
	if (exp < -14)
		mask <<= min(-(exp + 14), 10);
	/* RTZ does not produce Inf for large numbers */
	if (fabs(x) > 65504.0f && !isinf(x))
		return copysign(65504.0f, x);
	/* Handle the NaN corner case */
	if (isnan(x))
		return x;
	return as_float(as_uint(x) & mask);
}
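/* Round away from zero: if any discarded bit is set, step to the next
 * half-representable value of larger magnitude. */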
_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x)
{
	const float inf = copysign(INFINITY, x);
	/* Set the lower 13 bits */
	int mask = (1 << 13) - 1;
	const int exp = (as_uint(x) >> 23 & 0xff) - 127;
	/* Denormals cannot be flushed, and they use a different bit for rounding */
	if (exp < -14)
		mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
	/* Handle the NaN corner case */
	if (isnan(x))
		return x;
	const float next = nextafter(as_float(as_uint(x) | mask), inf);
	return ((as_uint(x) & mask) == 0) ? x : next;
}
/* Round toward negative/positive infinity by picking the magnitude-based
 * rounding that matches the sign of x. */
_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x)
{
	return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
}
_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x)
{
	return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
}
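/* Round to nearest even: inspect the guard/round/sticky bits that half would
 * discard and round up when they exceed half a ulp, or on a tie when the
 * last kept bit is odd.  Values whose exponent exceeds the half range go to
 * infinity. */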
_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x)
{
	/* Mantissa + implicit bit */
	const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
	const int exp = (as_uint(x) >> 23 & 0xff) - 127;
	int shift = 13;
	if (exp < -14) {
		/* The default assumes the lower 13 bits are discarded,
		 * but it can be more for denormals.
		 * Shifting any further, where last == 0b and grs == 00b,
		 * is not necessary. */
		shift += min(-(exp + 14), 15);
	}
	int mask = (1 << shift) - 1;
	const uint grs = mantissa & mask;
	const uint last = mantissa & (1 << shift);
	/* IEEE round-up rule is: grs > 100b, or grs == 100b and last == 1.
	 * exp > 15 should round to inf. */
	bool roundup = (grs > (1 << (shift - 1))) ||
		(grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
	return roundup ? __clc_rti(x) : __clc_rtz(x);
}

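/* Double-precision variants of the helpers above.  half keeps 10 mantissa
 * bits, so 52 - 10 = 42 low bits of the double mantissa are discarded. */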
#ifdef cl_khr_fp64
_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x)
{
	return x;
}
_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x)
{
	/* Remove the lower 42 bits to make sure the number is rounded down */
	ulong mask = 0xfffffc0000000000UL;
	const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
	/* Denormals cannot be flushed, and they use a different bit for rounding */
	if (exp < -14)
		mask <<= min(-(exp + 14), 10);
	/* RTZ does not produce Inf for large numbers */
	if (fabs(x) > 65504.0 && !isinf(x))
		return copysign(65504.0, x);
	/* Handle the NaN corner case */
	if (isnan(x))
		return x;
	return as_double(as_ulong(x) & mask);
}
_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x)
{
	const double inf = copysign((double)INFINITY, x);
	/* Set the lower 42 bits */
	long mask = (1UL << 42UL) - 1UL;
	const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
	/* Denormals cannot be flushed, and they use a different bit for rounding */
	if (exp < -14)
		mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
	/* Handle the NaN corner case */
	if (isnan(x))
		return x;
	const double next = nextafter(as_double(as_ulong(x) | mask), inf);
	return ((as_ulong(x) & mask) == 0) ? x : next;
}
_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x)
{
	return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x) : __clc_rti(x);
}
_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x)
{
	return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x) : __clc_rtz(x);
}
_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x)
{
	/* Mantissa + implicit bit */
	const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
	const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
	int shift = 42;
	if (exp < -14) {
		/* The default assumes the lower 42 bits are discarded,
		 * but it can be more for denormals.
		 * Shifting any further, where last == 0b and grs == 00b,
		 * is not necessary. */
		shift += min(-(exp + 14), 15);
	}
	ulong mask = (1UL << shift) - 1UL;
	const ulong grs = mantissa & mask;
	const ulong last = mantissa & (1UL << shift);
	/* IEEE round-up rule is: grs > 100b, or grs == 100b and last == 1.
	 * exp > 15 should round to inf. */
	bool roundup = (grs > (1UL << (shift - 1UL))) ||
		(grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
	return roundup ? __clc_rti(x) : __clc_rtz(x);
}
#endif

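/* __XFUNC expands __FUNC for the default store plus the four explicit
 * rounding-mode suffixes.  FUNC is the hook invoked by vstore_half.inc,
 * which gentype.inc includes once per floating-point gentype below. */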
#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
	__FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop) \
	__FUNC(SUFFIX ## _rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz) \
	__FUNC(SUFFIX ## _rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn) \
	__FUNC(SUFFIX ## _rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp) \
	__FUNC(SUFFIX ## _rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte)

#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
	__XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)

#define __CLC_BODY "vstore_half.inc"
#include <clc/math/gentype.inc>
#undef __CLC_BODY
#undef FUNC
#undef __XFUNC
#undef __FUNC
#undef VEC_STORE16
#undef VEC_STORE8
#undef VEC_STORE4
#undef VEC_STORE3
#undef VEC_STORE2
#undef VEC_STORE1
#undef DECLARE_HELPER
#undef VSTORE_ADDR_SPACES
#undef VSTORE_VECTORIZE