1 #include "rs_core.rsh"
2 #include "rs_f16_util.h"
3
4 extern float2 __attribute__((overloadable)) convert_float2(int2 c);
5 extern float3 __attribute__((overloadable)) convert_float3(int3 c);
6 extern float4 __attribute__((overloadable)) convert_float4(int4 c);
7
8 extern int2 __attribute__((overloadable)) convert_int2(float2 c);
9 extern int3 __attribute__((overloadable)) convert_int3(float3 c);
10 extern int4 __attribute__((overloadable)) convert_int4(float4 c);
11
12
13 extern float __attribute__((overloadable)) fmin(float v, float v2);
14 extern float2 __attribute__((overloadable)) fmin(float2 v, float v2);
15 extern float3 __attribute__((overloadable)) fmin(float3 v, float v2);
16 extern float4 __attribute__((overloadable)) fmin(float4 v, float v2);
17
18 extern float __attribute__((overloadable)) fmax(float v, float v2);
19 extern float2 __attribute__((overloadable)) fmax(float2 v, float v2);
20 extern float3 __attribute__((overloadable)) fmax(float3 v, float v2);
21 extern float4 __attribute__((overloadable)) fmax(float4 v, float v2);
22
23 // Float ops, 6.11.2
24
/*
 * FN_FUNC_FN(fnc): defines the float2, float3 and float4 overloads of a
 * one-argument function by calling fnc on each component in turn.
 */
#define FN_FUNC_FN(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v) { \
    float2 res; \
    res.x = fnc(v.x); \
    res.y = fnc(v.y); \
    return res; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v) { \
    float3 res; \
    res.x = fnc(v.x); \
    res.y = fnc(v.y); \
    res.z = fnc(v.z); \
    return res; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v) { \
    float4 res; \
    res.x = fnc(v.x); \
    res.y = fnc(v.y); \
    res.z = fnc(v.z); \
    res.w = fnc(v.w); \
    return res; \
}
47
/*
 * IN_FUNC_FN(fnc): defines int2/int3/int4-returning overloads that take
 * float2/float3/float4 and call fnc on each component.
 */
#define IN_FUNC_FN(fnc) \
extern int2 __attribute__((overloadable)) fnc(float2 v) { \
    int2 res; \
    res.x = fnc(v.x); \
    res.y = fnc(v.y); \
    return res; \
} \
extern int3 __attribute__((overloadable)) fnc(float3 v) { \
    int3 res; \
    res.x = fnc(v.x); \
    res.y = fnc(v.y); \
    res.z = fnc(v.z); \
    return res; \
} \
extern int4 __attribute__((overloadable)) fnc(float4 v) { \
    int4 res; \
    res.x = fnc(v.x); \
    res.y = fnc(v.y); \
    res.z = fnc(v.z); \
    res.w = fnc(v.w); \
    return res; \
}
70
/*
 * FN_FUNC_FN_FN(fnc): defines float2/float3/float4 overloads of a
 * two-argument function; matching components of v1 and v2 are passed to fnc.
 */
#define FN_FUNC_FN_FN(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \
    float2 res; \
    res.x = fnc(v1.x, v2.x); \
    res.y = fnc(v1.y, v2.y); \
    return res; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \
    float3 res; \
    res.x = fnc(v1.x, v2.x); \
    res.y = fnc(v1.y, v2.y); \
    res.z = fnc(v1.z, v2.z); \
    return res; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \
    float4 res; \
    res.x = fnc(v1.x, v2.x); \
    res.y = fnc(v1.y, v2.y); \
    res.z = fnc(v1.z, v2.z); \
    res.w = fnc(v1.w, v2.w); \
    return res; \
}
93
/*
 * FN_FUNC_FN_F(fnc): defines float2/float3/float4 overloads whose second
 * argument is a single float shared by every component of v1.
 */
#define FN_FUNC_FN_F(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) { \
    float2 res; \
    res.x = fnc(v1.x, v2); \
    res.y = fnc(v1.y, v2); \
    return res; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) { \
    float3 res; \
    res.x = fnc(v1.x, v2); \
    res.y = fnc(v1.y, v2); \
    res.z = fnc(v1.z, v2); \
    return res; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) { \
    float4 res; \
    res.x = fnc(v1.x, v2); \
    res.y = fnc(v1.y, v2); \
    res.z = fnc(v1.z, v2); \
    res.w = fnc(v1.w, v2); \
    return res; \
}
116
/*
 * FN_FUNC_FN_IN(fnc): defines float2/float3/float4 overloads whose second
 * argument is an int vector of the same width; components are paired up.
 */
#define FN_FUNC_FN_IN(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) { \
    float2 res; \
    res.x = fnc(v1.x, v2.x); \
    res.y = fnc(v1.y, v2.y); \
    return res; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) { \
    float3 res; \
    res.x = fnc(v1.x, v2.x); \
    res.y = fnc(v1.y, v2.y); \
    res.z = fnc(v1.z, v2.z); \
    return res; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) { \
    float4 res; \
    res.x = fnc(v1.x, v2.x); \
    res.y = fnc(v1.y, v2.y); \
    res.z = fnc(v1.z, v2.z); \
    res.w = fnc(v1.w, v2.w); \
    return res; \
}
139
/*
 * FN_FUNC_FN_I(fnc): defines float2/float3/float4 overloads whose second
 * argument is a single int shared by every component of v1.
 */
#define FN_FUNC_FN_I(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) { \
    float2 res; \
    res.x = fnc(v1.x, v2); \
    res.y = fnc(v1.y, v2); \
    return res; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) { \
    float3 res; \
    res.x = fnc(v1.x, v2); \
    res.y = fnc(v1.y, v2); \
    res.z = fnc(v1.z, v2); \
    return res; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) { \
    float4 res; \
    res.x = fnc(v1.x, v2); \
    res.y = fnc(v1.y, v2); \
    res.z = fnc(v1.z, v2); \
    res.w = fnc(v1.w, v2); \
    return res; \
}
162
/*
 * FN_FUNC_FN_PFN(fnc): defines float2/float3/float4 overloads of a function
 * with a float output pointer. Each scalar call writes its pointer result
 * into a local; the locals are copied into *v2 after all calls are done.
 */
#define FN_FUNC_FN_PFN(fnc) \
extern float2 __attribute__((overloadable)) \
        fnc(float2 v1, float2 *v2) { \
    float2 res; \
    float px, py; \
    res.x = fnc(v1.x, &px); \
    res.y = fnc(v1.y, &py); \
    v2->x = px; \
    v2->y = py; \
    return res; \
} \
extern float3 __attribute__((overloadable)) \
        fnc(float3 v1, float3 *v2) { \
    float3 res; \
    float px, py, pz; \
    res.x = fnc(v1.x, &px); \
    res.y = fnc(v1.y, &py); \
    res.z = fnc(v1.z, &pz); \
    v2->x = px; \
    v2->y = py; \
    v2->z = pz; \
    return res; \
} \
extern float4 __attribute__((overloadable)) \
        fnc(float4 v1, float4 *v2) { \
    float4 res; \
    float px, py, pz, pw; \
    res.x = fnc(v1.x, &px); \
    res.y = fnc(v1.y, &py); \
    res.z = fnc(v1.z, &pz); \
    res.w = fnc(v1.w, &pw); \
    v2->x = px; \
    v2->y = py; \
    v2->z = pz; \
    v2->w = pw; \
    return res; \
}
200
/*
 * FN_FUNC_FN_PIN(fnc): defines float2/float3/float4 overloads of a function
 * with an int output pointer. Each scalar call writes its pointer result
 * into a local; the locals are copied into *v2 after all calls are done.
 */
#define FN_FUNC_FN_PIN(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) { \
    float2 res; \
    int ix, iy; \
    res.x = fnc(v1.x, &ix); \
    res.y = fnc(v1.y, &iy); \
    v2->x = ix; \
    v2->y = iy; \
    return res; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) { \
    float3 res; \
    int ix, iy, iz; \
    res.x = fnc(v1.x, &ix); \
    res.y = fnc(v1.y, &iy); \
    res.z = fnc(v1.z, &iz); \
    v2->x = ix; \
    v2->y = iy; \
    v2->z = iz; \
    return res; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) { \
    float4 res; \
    int ix, iy, iz, iw; \
    res.x = fnc(v1.x, &ix); \
    res.y = fnc(v1.y, &iy); \
    res.z = fnc(v1.z, &iz); \
    res.w = fnc(v1.w, &iw); \
    v2->x = ix; \
    v2->y = iy; \
    v2->z = iz; \
    v2->w = iw; \
    return res; \
}
235
/*
 * FN_FUNC_FN_FN_FN(fnc): defines float2/float3/float4 overloads of a
 * three-argument function; matching components of v1, v2 and v3 go to fnc.
 */
#define FN_FUNC_FN_FN_FN(fnc) \
extern float2 __attribute__((overloadable)) \
        fnc(float2 v1, float2 v2, float2 v3) { \
    float2 res; \
    res.x = fnc(v1.x, v2.x, v3.x); \
    res.y = fnc(v1.y, v2.y, v3.y); \
    return res; \
} \
extern float3 __attribute__((overloadable)) \
        fnc(float3 v1, float3 v2, float3 v3) { \
    float3 res; \
    res.x = fnc(v1.x, v2.x, v3.x); \
    res.y = fnc(v1.y, v2.y, v3.y); \
    res.z = fnc(v1.z, v2.z, v3.z); \
    return res; \
} \
extern float4 __attribute__((overloadable)) \
        fnc(float4 v1, float4 v2, float4 v3) { \
    float4 res; \
    res.x = fnc(v1.x, v2.x, v3.x); \
    res.y = fnc(v1.y, v2.y, v3.y); \
    res.z = fnc(v1.z, v2.z, v3.z); \
    res.w = fnc(v1.w, v2.w, v3.w); \
    return res; \
}
261
/*
 * FN_FUNC_FN_FN_PIN(fnc): defines float2/float3/float4 overloads of a
 * function taking two float inputs and an int output pointer. Each scalar
 * call writes its pointer result into a local; the locals are copied into
 * *v3 after all calls are done.
 */
#define FN_FUNC_FN_FN_PIN(fnc) \
extern float2 __attribute__((overloadable)) \
        fnc(float2 v1, float2 v2, int2 *v3) { \
    float2 res; \
    int ix, iy; \
    res.x = fnc(v1.x, v2.x, &ix); \
    res.y = fnc(v1.y, v2.y, &iy); \
    v3->x = ix; \
    v3->y = iy; \
    return res; \
} \
extern float3 __attribute__((overloadable)) \
        fnc(float3 v1, float3 v2, int3 *v3) { \
    float3 res; \
    int ix, iy, iz; \
    res.x = fnc(v1.x, v2.x, &ix); \
    res.y = fnc(v1.y, v2.y, &iy); \
    res.z = fnc(v1.z, v2.z, &iz); \
    v3->x = ix; \
    v3->y = iy; \
    v3->z = iz; \
    return res; \
} \
extern float4 __attribute__((overloadable)) \
        fnc(float4 v1, float4 v2, int4 *v3) { \
    float4 res; \
    int ix, iy, iz, iw; \
    res.x = fnc(v1.x, v2.x, &ix); \
    res.y = fnc(v1.y, v2.y, &iy); \
    res.z = fnc(v1.z, v2.z, &iz); \
    res.w = fnc(v1.w, v2.w, &iw); \
    v3->x = ix; \
    v3->y = iy; \
    v3->z = iz; \
    v3->w = iw; \
    return res; \
}
299
/* Integer bit patterns that posinf(), neginf() and isinf() reinterpret as
 * or compare against float infinities. */
static const int iposinf = 0x7f800000;
/* 0xff800000 is an unsigned literal; the cast spells out the conversion to
 * int that the plain initializer performed implicitly. */
static const int ineginf = (int)0xff800000u;
302
posinf()303 static const float posinf() {
304 float f = *((float*)&iposinf);
305 return f;
306 }
307
neginf()308 static const float neginf() {
309 float f = *((float*)&ineginf);
310 return f;
311 }
312
isinf(float f)313 static bool isinf(float f) {
314 int i = *((int*)(void*)&f);
315 return (i == iposinf) || (i == ineginf);
316 }
317
/**
 * Returns true when f has all exponent bits (0x7f800000) set and a non-zero
 * mantissa (0x007fffff).
 * The bits are read through a union rather than an int pointer cast, which
 * violated the aliasing rules.
 */
static bool isnan(float f) {
    union { float f; int i; } bits;
    bits.f = f;
    const bool exponent_all_ones = (bits.i & 0x7f800000) == 0x7f800000;
    const bool mantissa_nonzero = (bits.i & 0x007fffff) != 0;
    return exponent_all_ones && mantissa_nonzero;
}
322
/**
 * Returns true only when every bit of f is zero (positive zero).
 * The bits are read through a union rather than an int pointer cast, which
 * violated the aliasing rules.
 */
static bool isposzero(float f) {
    union { float f; uint32_t u; } bits;
    bits.f = f;
    return bits.u == 0x00000000u;
}
327
/**
 * Returns true only when the bits of f are exactly 0x80000000 (sign bit set,
 * everything else clear).
 * The bits are read through a union as an unsigned value, which avoids both
 * the aliasing-violating pointer cast and the signed/unsigned comparison.
 */
static bool isnegzero(float f) {
    union { float f; uint32_t u; } bits;
    bits.f = f;
    return bits.u == 0x80000000u;
}
332
/** Returns true when f is either positive or negative zero. */
static bool iszero(float f) {
    if (isposzero(f)) {
        return true;
    }
    return isnegzero(f);
}
336
337
338 extern float __attribute__((overloadable)) SC_acosf(float);
acos(float v)339 float __attribute__((overloadable)) acos(float v) {
340 return SC_acosf(v);
341 }
342 FN_FUNC_FN(acos)
343
344 extern float __attribute__((overloadable)) SC_acoshf(float);
acosh(float v)345 float __attribute__((overloadable)) acosh(float v) {
346 return SC_acoshf(v);
347 }
FN_FUNC_FN(acosh)348 FN_FUNC_FN(acosh)
349
350
351 extern float __attribute__((overloadable)) acospi(float v) {
352 return acos(v) / M_PI;
353 }
354 FN_FUNC_FN(acospi)
355
356 extern float __attribute__((overloadable)) SC_asinf(float);
asin(float v)357 float __attribute__((overloadable)) asin(float v) {
358 return SC_asinf(v);
359 }
360 FN_FUNC_FN(asin)
361
362 extern float __attribute__((overloadable)) SC_asinhf(float);
asinh(float v)363 float __attribute__((overloadable)) asinh(float v) {
364 return SC_asinhf(v);
365 }
FN_FUNC_FN(asinh)366 FN_FUNC_FN(asinh)
367
368 extern float __attribute__((overloadable)) asinpi(float v) {
369 return asin(v) / M_PI;
370 }
371 FN_FUNC_FN(asinpi)
372
373 extern float __attribute__((overloadable)) SC_atanf(float);
atan(float v)374 float __attribute__((overloadable)) atan(float v) {
375 return SC_atanf(v);
376 }
377 FN_FUNC_FN(atan)
378
379 extern float __attribute__((overloadable)) SC_atan2f(float, float);
atan2(float v1,float v2)380 float __attribute__((overloadable)) atan2(float v1, float v2) {
381 return SC_atan2f(v1, v2);
382 }
383 FN_FUNC_FN_FN(atan2)
384
385 extern float __attribute__((overloadable)) SC_atanhf(float);
atanh(float v)386 float __attribute__((overloadable)) atanh(float v) {
387 return SC_atanhf(v);
388 }
FN_FUNC_FN(atanh)389 FN_FUNC_FN(atanh)
390
391 extern float __attribute__((overloadable)) atanpi(float v) {
392 return atan(v) / M_PI;
393 }
FN_FUNC_FN(atanpi)394 FN_FUNC_FN(atanpi)
395
396
397 extern float __attribute__((overloadable)) atan2pi(float y, float x) {
398 return atan2(y, x) / M_PI;
399 }
400 FN_FUNC_FN_FN(atan2pi)
401
402 extern float __attribute__((overloadable)) SC_cbrtf(float);
cbrt(float v)403 float __attribute__((overloadable)) cbrt(float v) {
404 return SC_cbrtf(v);
405 }
406 FN_FUNC_FN(cbrt)
407
408 extern float __attribute__((overloadable)) SC_ceilf(float);
ceil(float v)409 float __attribute__((overloadable)) ceil(float v) {
410 return SC_ceilf(v);
411 }
412 FN_FUNC_FN(ceil)
413
414 extern float __attribute__((overloadable)) SC_copysignf(float, float);
copysign(float v1,float v2)415 float __attribute__((overloadable)) copysign(float v1, float v2) {
416 return SC_copysignf(v1, v2);
417 }
418 FN_FUNC_FN_FN(copysign)
419
420 extern float __attribute__((overloadable)) SC_cosf(float);
cos(float v)421 float __attribute__((overloadable)) cos(float v) {
422 return SC_cosf(v);
423 }
424 FN_FUNC_FN(cos)
425
426 extern float __attribute__((overloadable)) SC_coshf(float);
cosh(float v)427 float __attribute__((overloadable)) cosh(float v) {
428 return SC_coshf(v);
429 }
FN_FUNC_FN(cosh)430 FN_FUNC_FN(cosh)
431
432 extern float __attribute__((overloadable)) cospi(float v) {
433 return cos(v * M_PI);
434 }
435 FN_FUNC_FN(cospi)
436
437 extern float __attribute__((overloadable)) SC_erfcf(float);
erfc(float v)438 float __attribute__((overloadable)) erfc(float v) {
439 return SC_erfcf(v);
440 }
441 FN_FUNC_FN(erfc)
442
443 extern float __attribute__((overloadable)) SC_erff(float);
erf(float v)444 float __attribute__((overloadable)) erf(float v) {
445 return SC_erff(v);
446 }
447 FN_FUNC_FN(erf)
448
449 extern float __attribute__((overloadable)) SC_expf(float);
exp(float v)450 float __attribute__((overloadable)) exp(float v) {
451 return SC_expf(v);
452 }
453 FN_FUNC_FN(exp)
454
455 extern float __attribute__((overloadable)) SC_exp2f(float);
exp2(float v)456 float __attribute__((overloadable)) exp2(float v) {
457 return SC_exp2f(v);
458 }
459 FN_FUNC_FN(exp2)
460
461 extern float __attribute__((overloadable)) pow(float, float);
462
exp10(float v)463 extern float __attribute__((overloadable)) exp10(float v) {
464 return exp2(v * 3.321928095f);
465 }
466 FN_FUNC_FN(exp10)
467
468 extern float __attribute__((overloadable)) SC_expm1f(float);
expm1(float v)469 float __attribute__((overloadable)) expm1(float v) {
470 return SC_expm1f(v);
471 }
FN_FUNC_FN(expm1)472 FN_FUNC_FN(expm1)
473
/**
 * Scalar fabs: clears the sign bit (bit 31) of v and returns the result.
 * The bits are edited through a union instead of int/float pointer casts,
 * which violated the aliasing rules.
 */
extern float __attribute__((overloadable)) fabs(float v) {
    union { float f; uint32_t u; } bits;
    bits.f = v;
    bits.u &= 0x7fffffffu;
    return bits.f;
}
478 FN_FUNC_FN(fabs)
479
480 extern float __attribute__((overloadable)) SC_fdimf(float, float);
fdim(float v1,float v2)481 float __attribute__((overloadable)) fdim(float v1, float v2) {
482 return SC_fdimf(v1, v2);
483 }
484 FN_FUNC_FN_FN(fdim)
485
486 extern float __attribute__((overloadable)) SC_floorf(float);
floor(float v)487 float __attribute__((overloadable)) floor(float v) {
488 return SC_floorf(v);
489 }
490 FN_FUNC_FN(floor)
491
492 extern float __attribute__((overloadable)) SC_fmaf(float, float, float);
fma(float v1,float v2,float v3)493 float __attribute__((overloadable)) fma(float v1, float v2, float v3) {
494 return SC_fmaf(v1, v2, v3);
495 }
496 FN_FUNC_FN_FN_FN(fma)
497
498 extern float __attribute__((overloadable)) SC_fminf(float, float);
499
500 extern float __attribute__((overloadable)) SC_fmodf(float, float);
fmod(float v1,float v2)501 float __attribute__((overloadable)) fmod(float v1, float v2) {
502 return SC_fmodf(v1, v2);
503 }
FN_FUNC_FN_FN(fmod)504 FN_FUNC_FN_FN(fmod)
505
506 extern float __attribute__((overloadable)) fract(float v, float *iptr) {
507 int i = (int)floor(v);
508 if (iptr) {
509 iptr[0] = i;
510 }
511 return fmin(v - i, 0x1.fffffep-1f);
512 }
FN_FUNC_FN_PFN(fract)513 FN_FUNC_FN_PFN(fract)
514
515 extern float __attribute__((const, overloadable)) fract(float v) {
516 float unused;
517 return fract(v, &unused);
518 }
519 FN_FUNC_FN(fract)
520
521 extern float __attribute__((overloadable)) SC_frexpf(float, int *);
frexp(float v1,int * v2)522 float __attribute__((overloadable)) frexp(float v1, int* v2) {
523 return SC_frexpf(v1, v2);
524 }
525 FN_FUNC_FN_PIN(frexp)
526
527 extern float __attribute__((overloadable)) SC_hypotf(float, float);
hypot(float v1,float v2)528 float __attribute__((overloadable)) hypot(float v1, float v2) {
529 return SC_hypotf(v1, v2);
530 }
531 FN_FUNC_FN_FN(hypot)
532
533 extern int __attribute__((overloadable)) SC_ilogbf(float);
ilogb(float v)534 int __attribute__((overloadable)) ilogb(float v) {
535 return SC_ilogbf(v);
536 }
537 IN_FUNC_FN(ilogb)
538
539 extern float __attribute__((overloadable)) SC_ldexpf(float, int);
ldexp(float v1,int v2)540 float __attribute__((overloadable)) ldexp(float v1, int v2) {
541 return SC_ldexpf(v1, v2);
542 }
543 FN_FUNC_FN_IN(ldexp)
544 FN_FUNC_FN_I(ldexp)
545
546 extern float __attribute__((overloadable)) SC_lgammaf(float);
lgamma(float v)547 float __attribute__((overloadable)) lgamma(float v) {
548 return SC_lgammaf(v);
549 }
550 FN_FUNC_FN(lgamma)
551 extern float __attribute__((overloadable)) SC_lgammaf_r(float, int*);
lgamma(float v,int * ptr)552 float __attribute__((overloadable)) lgamma(float v, int* ptr) {
553 return SC_lgammaf_r(v, ptr);
554 }
555 FN_FUNC_FN_PIN(lgamma)
556
557 extern float __attribute__((overloadable)) SC_logf(float);
log(float v)558 float __attribute__((overloadable)) log(float v) {
559 return SC_logf(v);
560 }
561 FN_FUNC_FN(log)
562
563 extern float __attribute__((overloadable)) SC_log10f(float);
log10(float v)564 float __attribute__((overloadable)) log10(float v) {
565 return SC_log10f(v);
566 }
FN_FUNC_FN(log10)567 FN_FUNC_FN(log10)
568
569
570 extern float __attribute__((overloadable)) log2(float v) {
571 return log10(v) * 3.321928095f;
572 }
573 FN_FUNC_FN(log2)
574
575 extern float __attribute__((overloadable)) SC_log1pf(float);
log1p(float v)576 float __attribute__((overloadable)) log1p(float v) {
577 return SC_log1pf(v);
578 }
579 FN_FUNC_FN(log1p)
580
581 extern float __attribute__((overloadable)) SC_logbf(float);
logb(float v)582 float __attribute__((overloadable)) logb(float v) {
583 return SC_logbf(v);
584 }
FN_FUNC_FN(logb)585 FN_FUNC_FN(logb)
586
/**
 * Scalar multiply-add: returns a * b + c.
 * Kept as one expression so the compiler sees the same multiply-add shape.
 */
extern float __attribute__((overloadable)) mad(float a, float b, float c) {
    return (a * b) + c;
}
mad(float2 a,float2 b,float2 c)590 extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) {
591 return a * b + c;
592 }
mad(float3 a,float3 b,float3 c)593 extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) {
594 return a * b + c;
595 }
mad(float4 a,float4 b,float4 c)596 extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) {
597 return a * b + c;
598 }
599
/* SC_modff is only declared here; its definition is outside this section. */
extern float __attribute__((overloadable)) SC_modff(float v, float *out);

/** Scalar modf: passes v1 and the output pointer v2 to SC_modff(). */
float __attribute__((overloadable)) modf(float v1, float *v2) {
    return SC_modff(v1, v2);
}
/* float2/float3/float4 overloads with a matching vector output pointer. */
FN_FUNC_FN_PFN(modf);
605
/**
 * Builds a float from the bit pattern (v | 0x7fc00000), i.e. the caller's
 * bits with the exponent field and the top mantissa bit forced on.
 * The pattern is written through a union rather than through a uint32_t
 * pointer aimed at float storage, which violated the aliasing rules.
 */
extern float __attribute__((overloadable)) nan(uint v) {
    union { float f; uint32_t u; } bits;
    bits.u = v | 0x7fc00000;
    return bits.f;
}
612
613 extern float __attribute__((overloadable)) SC_nextafterf(float, float);
nextafter(float v1,float v2)614 float __attribute__((overloadable)) nextafter(float v1, float v2) {
615 return SC_nextafterf(v1, v2);
616 }
617 FN_FUNC_FN_FN(nextafter)
618
619 // This function must be defined here if we're compiling with debug info
620 // (libclcore_g.bc), because we need a C source to get debug information.
621 // Otherwise the implementation can be found in IR.
622 #if defined(RS_G_RUNTIME)
623 extern float __attribute__((overloadable)) SC_powf(float, float);
pow(float v1,float v2)624 float __attribute__((overloadable)) pow(float v1, float v2) {
625 return SC_powf(v1, v2);
626 }
627 #endif // defined(RS_G_RUNTIME)
FN_FUNC_FN_FN(pow)628 FN_FUNC_FN_FN(pow)
629
630 extern float __attribute__((overloadable)) pown(float v, int p) {
631 /* The mantissa of a float has fewer bits than an int (24 effective vs. 31).
632 * For very large ints, we'll lose whether the exponent is even or odd, making
633 * the selection of a correct sign incorrect. We correct this. Use copysign
634 * to handle the negative zero case.
635 */
636 float sign = (p & 0x1) ? copysign(1.f, v) : 1.f;
637 float f = pow(v, (float)p);
638 return copysign(f, sign);
639 }
FN_FUNC_FN_IN(pown)640 FN_FUNC_FN_IN(pown)
641
642 extern float __attribute__((overloadable)) powr(float v, float p) {
643 return pow(v, p);
644 }
powr(float2 v,float2 p)645 extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) {
646 return pow(v, p);
647 }
powr(float3 v,float3 p)648 extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) {
649 return pow(v, p);
650 }
powr(float4 v,float4 p)651 extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) {
652 return pow(v, p);
653 }
654
655 extern float __attribute__((overloadable)) SC_remainderf(float, float);
remainder(float v1,float v2)656 float __attribute__((overloadable)) remainder(float v1, float v2) {
657 return SC_remainderf(v1, v2);
658 }
659 FN_FUNC_FN_FN(remainder)
660
661 extern float __attribute__((overloadable)) SC_remquof(float, float, int *);
remquo(float v1,float v2,int * v3)662 float __attribute__((overloadable)) remquo(float v1, float v2, int *v3) {
663 return SC_remquof(v1, v2, v3);
664 }
665 FN_FUNC_FN_FN_PIN(remquo)
666
667 extern float __attribute__((overloadable)) SC_rintf(float);
rint(float v)668 float __attribute__((overloadable)) rint(float v) {
669 return SC_rintf(v);
670 }
FN_FUNC_FN(rint)671 FN_FUNC_FN(rint)
672
/**
 * Scalar rootn: the r-th root of v, evaluated as pow(v, 1 / r) with the
 * special cases below handled first.
 *
 *   r == 0                       -> +infinity
 *   v == +-0, r < 0, r odd       -> infinity carrying the sign of v
 *   v == +-0, r < 0, r even      -> +infinity
 *   v == +-0, r > 0, r odd       -> zero carrying the sign of v
 *   v == +-0, r > 0, r even      -> +0
 *   finite v < 0, r odd          -> -pow(-v, 1 / r)
 *   finite v < 0, r even         -> nan(0)
 *
 * NOTE(review): OpenCL specifies NaN for rootn(x, 0); this code returns
 * +infinity. Confirm which result is intended.
 */
extern float __attribute__((overloadable)) rootn(float v, int r) {
    if (r == 0) {
        return posinf();
    }

    const bool odd_root = (r & 1) != 0;

    if (iszero(v)) {
        if (r < 0) {
            return odd_root ? copysign(posinf(), v) : posinf();
        }
        return odd_root ? copysign(0.f, v) : 0.f;
    }

    const bool finite_negative = !isinf(v) && !isnan(v) && (v < 0.f);
    if (finite_negative) {
        if (!odd_root) {
            return nan(0);
        }
        return (-1.f * pow(-1.f * v, 1.f / r));
    }

    return pow(v, 1.f / r);
}
/* float2/float3/float4 overloads taking an int vector of the same width. */
FN_FUNC_FN_IN(rootn);
705
706 extern float __attribute__((overloadable)) SC_roundf(float);
round(float v)707 float __attribute__((overloadable)) round(float v) {
708 return SC_roundf(v);
709 }
710 FN_FUNC_FN(round)
711
/* SC_randf2 is only declared here; its definition is outside this section. */
extern float __attribute__((overloadable)) SC_randf2(float min, float max);

/** rsRand(min, max): passes both bounds to SC_randf2() and returns its result. */
float __attribute__((overloadable)) rsRand(float min, float max) {
    return SC_randf2(min, max);
}
716
717
/** Scalar rsqrt: the reciprocal of sqrt(v). */
extern float __attribute__((overloadable)) rsqrt(float v) {
    const float root = sqrt(v);
    return 1.f / root;
}
721
722 #if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
723 // These functions must be defined here if we are not using the SSE
724 // implementation, which includes when we are built as part of the
725 // debug runtime (libclcore_debug.bc) or compiling with debug info.
726 #if defined(RS_G_RUNTIME)
727 extern float __attribute__((overloadable)) SC_sqrtf(float);
sqrt(float v)728 float __attribute__((overloadable)) sqrt(float v) {
729 return SC_sqrtf(v);
730 }
731 #endif // defined(RS_G_RUNTIME)
732
733 FN_FUNC_FN(sqrt)
734 #else
735 extern float2 __attribute__((overloadable)) sqrt(float2);
736 extern float3 __attribute__((overloadable)) sqrt(float3);
737 extern float4 __attribute__((overloadable)) sqrt(float4);
738 #endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
739
740 FN_FUNC_FN(rsqrt)
741
742 extern float __attribute__((overloadable)) SC_sinf(float);
sin(float v)743 float __attribute__((overloadable)) sin(float v) {
744 return SC_sinf(v);
745 }
FN_FUNC_FN(sin)746 FN_FUNC_FN(sin)
747
748 extern float __attribute__((overloadable)) sincos(float v, float *cosptr) {
749 *cosptr = cos(v);
750 return sin(v);
751 }
sincos(float2 v,float2 * cosptr)752 extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) {
753 *cosptr = cos(v);
754 return sin(v);
755 }
sincos(float3 v,float3 * cosptr)756 extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) {
757 *cosptr = cos(v);
758 return sin(v);
759 }
sincos(float4 v,float4 * cosptr)760 extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) {
761 *cosptr = cos(v);
762 return sin(v);
763 }
764
765 extern float __attribute__((overloadable)) SC_sinhf(float);
sinh(float v)766 float __attribute__((overloadable)) sinh(float v) {
767 return SC_sinhf(v);
768 }
FN_FUNC_FN(sinh)769 FN_FUNC_FN(sinh)
770
771 extern float __attribute__((overloadable)) sinpi(float v) {
772 return sin(v * M_PI);
773 }
774 FN_FUNC_FN(sinpi)
775
776 extern float __attribute__((overloadable)) SC_tanf(float);
tan(float v)777 float __attribute__((overloadable)) tan(float v) {
778 return SC_tanf(v);
779 }
780 FN_FUNC_FN(tan)
781
782 extern float __attribute__((overloadable)) SC_tanhf(float);
tanh(float v)783 float __attribute__((overloadable)) tanh(float v) {
784 return SC_tanhf(v);
785 }
FN_FUNC_FN(tanh)786 FN_FUNC_FN(tanh)
787
788 extern float __attribute__((overloadable)) tanpi(float v) {
789 return tan(v * M_PI);
790 }
791 FN_FUNC_FN(tanpi)
792
793
794 extern float __attribute__((overloadable)) SC_tgammaf(float);
tgamma(float v)795 float __attribute__((overloadable)) tgamma(float v) {
796 return SC_tgammaf(v);
797 }
798 FN_FUNC_FN(tgamma)
799
800 extern float __attribute__((overloadable)) SC_truncf(float);
trunc(float v)801 float __attribute__((overloadable)) trunc(float v) {
802 return SC_truncf(v);
803 }
FN_FUNC_FN(trunc)804 FN_FUNC_FN(trunc)
805
806 // Int ops (partial), 6.11.3
807
/*
 * XN_FUNC_YN(typeout, fnc, typein): declares the scalar overload
 * typeout fnc(typein) and defines the 2-, 3- and 4-component vector
 * overloads, each of which calls fnc on every component.
 */
#define XN_FUNC_YN(typeout, fnc, typein) \
extern typeout __attribute__((overloadable)) fnc(typein); \
extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) { \
    typeout##2 res; \
    res.x = fnc(v.x); \
    res.y = fnc(v.y); \
    return res; \
} \
extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) { \
    typeout##3 res; \
    res.x = fnc(v.x); \
    res.y = fnc(v.y); \
    res.z = fnc(v.z); \
    return res; \
} \
extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) { \
    typeout##4 res; \
    res.x = fnc(v.x); \
    res.y = fnc(v.y); \
    res.z = fnc(v.z); \
    res.w = fnc(v.w); \
    return res; \
}
831
832
/*
 * UIN_FUNC_IN(fnc): instantiates XN_FUNC_YN for signed int, short and char
 * inputs, each returning the unsigned type of the same width.
 */
#define UIN_FUNC_IN(fnc) \
XN_FUNC_YN(uint, fnc, int) \
XN_FUNC_YN(ushort, fnc, short) \
XN_FUNC_YN(uchar, fnc, char)
837
/*
 * IN_FUNC_IN(fnc): instantiates XN_FUNC_YN for the six 8-, 16- and 32-bit
 * integer types, each returning the same type it takes.
 */
#define IN_FUNC_IN(fnc) \
XN_FUNC_YN(char, fnc, char) \
XN_FUNC_YN(uchar, fnc, uchar) \
XN_FUNC_YN(short, fnc, short) \
XN_FUNC_YN(ushort, fnc, ushort) \
XN_FUNC_YN(int, fnc, int) \
XN_FUNC_YN(uint, fnc, uint)
845
846
/*
 * XN_FUNC_XN_XN_BODY(type, fnc, body): defines the scalar overload
 * type fnc(type v1, type v2) whose result is the expression `body` (written
 * in terms of v1 and v2), plus 2-, 3- and 4-component vector overloads that
 * call the scalar one on each pair of components.
 */
#define XN_FUNC_XN_XN_BODY(type, fnc, body) \
extern type __attribute__((overloadable)) \
        fnc(type v1, type v2) { \
    return body; \
} \
extern type##2 __attribute__((overloadable)) \
        fnc(type##2 v1, type##2 v2) { \
    type##2 res; \
    res.x = fnc(v1.x, v2.x); \
    res.y = fnc(v1.y, v2.y); \
    return res; \
} \
extern type##3 __attribute__((overloadable)) \
        fnc(type##3 v1, type##3 v2) { \
    type##3 res; \
    res.x = fnc(v1.x, v2.x); \
    res.y = fnc(v1.y, v2.y); \
    res.z = fnc(v1.z, v2.z); \
    return res; \
} \
extern type##4 __attribute__((overloadable)) \
        fnc(type##4 v1, type##4 v2) { \
    type##4 res; \
    res.x = fnc(v1.x, v2.x); \
    res.y = fnc(v1.y, v2.y); \
    res.z = fnc(v1.z, v2.z); \
    res.w = fnc(v1.w, v2.w); \
    return res; \
}
876
/*
 * IN_FUNC_IN_IN_BODY(fnc, body): instantiates XN_FUNC_XN_XN_BODY with the
 * same body for the six 8-, 16- and 32-bit integer types and for float.
 */
#define IN_FUNC_IN_IN_BODY(fnc, body) \
XN_FUNC_XN_XN_BODY(char, fnc, body) \
XN_FUNC_XN_XN_BODY(uchar, fnc, body) \
XN_FUNC_XN_XN_BODY(short, fnc, body) \
XN_FUNC_XN_XN_BODY(ushort, fnc, body) \
XN_FUNC_XN_XN_BODY(int, fnc, body) \
XN_FUNC_XN_XN_BODY(uint, fnc, body) \
XN_FUNC_XN_XN_BODY(float, fnc, body)
885
886
887 /**
888 * abs
889 */
890 extern uint32_t __attribute__((overloadable)) abs(int32_t v) {
891 if (v < 0)
892 return -v;
893 return v;
894 }
/**
 * 16-bit abs: returns the magnitude of v as an unsigned 16-bit value.
 * v is promoted to int before negation, so -32768 negates without overflow.
 */
extern uint16_t __attribute__((overloadable)) abs(int16_t v) {
    return (v < 0) ? -v : v;
}
/**
 * 8-bit abs: returns the magnitude of v as an unsigned 8-bit value.
 * v is promoted to int before negation, so -128 negates without overflow.
 */
extern uint8_t __attribute__((overloadable)) abs(int8_t v) {
    return (v < 0) ? -v : v;
}
905
/**
 * clz
 * __builtin_clz only accepts a 32-bit unsigned int, so every input is
 * expanded to 32 bits. For the smaller data types, the unused top bits
 * (which are always zeros) have to be subtracted from the count.
 */

/**
 * 32-bit unsigned clz. __builtin_clz(0) has an undefined result, so zero is
 * answered directly with the full width, 32.
 */
extern uint32_t __attribute__((overloadable)) clz(uint32_t v) {
    if (v == 0) {
        return 32;
    }
    return __builtin_clz(v);
}
/**
 * 16-bit unsigned clz: counts on the value widened to 32 bits and drops the
 * 16 extra leading zeros. __builtin_clz(0) has an undefined result, so zero
 * is answered directly with the full width, 16.
 */
extern uint16_t __attribute__((overloadable)) clz(uint16_t v) {
    if (v == 0) {
        return 16;
    }
    return __builtin_clz(v) - 16;
}
/**
 * 8-bit unsigned clz: counts on the value widened to 32 bits and drops the
 * 24 extra leading zeros. __builtin_clz(0) has an undefined result, so zero
 * is answered directly with the full width, 8.
 */
extern uint8_t __attribute__((overloadable)) clz(uint8_t v) {
    if (v == 0) {
        return 8;
    }
    return __builtin_clz(v) - 24;
}
/**
 * 32-bit signed clz: counts leading zeros of the bit pattern of v.
 * __builtin_clz(0) has an undefined result, so zero is answered directly
 * with the full width, 32.
 */
extern int32_t __attribute__((overloadable)) clz(int32_t v) {
    if (v == 0) {
        return 32;
    }
    return __builtin_clz((uint32_t)v);
}
/**
 * 16-bit signed clz: masks v to its low 16 bits (removing sign extension),
 * counts on the 32-bit value and drops the 16 extra leading zeros.
 * __builtin_clz(0) has an undefined result, so zero is answered directly
 * with the full width, 16.
 */
extern int16_t __attribute__((overloadable)) clz(int16_t v) {
    const uint32_t low_bits = ((uint32_t)v) & 0x0000ffff;
    if (low_bits == 0) {
        return 16;
    }
    return __builtin_clz(low_bits) - 16;
}
/**
 * 8-bit signed clz: masks v to its low 8 bits (removing sign extension),
 * counts on the 32-bit value and drops the 24 extra leading zeros.
 * __builtin_clz(0) has an undefined result, so zero is answered directly
 * with the full width, 8.
 */
extern int8_t __attribute__((overloadable)) clz(int8_t v) {
    const uint32_t low_bits = ((uint32_t)v) & 0x000000ff;
    if (low_bits == 0) {
        return 8;
    }
    return __builtin_clz(low_bits) - 24;
}
930
931
/* Vector abs overloads: char/short/int vectors in, unsigned vectors of the
 * same width out; each component goes through the scalar abs. */
UIN_FUNC_IN(abs)
/* Vector clz overloads for the six 8-, 16- and 32-bit integer types; each
 * component goes through the scalar clz of the same type. */
IN_FUNC_IN(clz)
934
935
936 // 6.11.4
937
938
939 extern float __attribute__((overloadable)) degrees(float radians) {
940 return radians * (180.f / M_PI);
941 }
degrees(float2 radians)942 extern float2 __attribute__((overloadable)) degrees(float2 radians) {
943 return radians * (180.f / M_PI);
944 }
degrees(float3 radians)945 extern float3 __attribute__((overloadable)) degrees(float3 radians) {
946 return radians * (180.f / M_PI);
947 }
degrees(float4 radians)948 extern float4 __attribute__((overloadable)) degrees(float4 radians) {
949 return radians * (180.f / M_PI);
950 }
951
/**
 * Scalar mix: start + (stop - start) * amount, i.e. start when amount is 0
 * and stop when amount is 1.
 */
extern float __attribute__((overloadable)) mix(float start, float stop, float amount) {
    const float span = stop - start;
    return start + span * amount;
}
mix(float2 start,float2 stop,float2 amount)955 extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) {
956 return start + (stop - start) * amount;
957 }
mix(float3 start,float3 stop,float3 amount)958 extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) {
959 return start + (stop - start) * amount;
960 }
mix(float4 start,float4 stop,float4 amount)961 extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) {
962 return start + (stop - start) * amount;
963 }
mix(float2 start,float2 stop,float amount)964 extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) {
965 return start + (stop - start) * amount;
966 }
mix(float3 start,float3 stop,float amount)967 extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) {
968 return start + (stop - start) * amount;
969 }
mix(float4 start,float4 stop,float amount)970 extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) {
971 return start + (stop - start) * amount;
972 }
973
radians(float degrees)974 extern float __attribute__((overloadable)) radians(float degrees) {
975 return degrees * (M_PI / 180.f);
976 }
radians(float2 degrees)977 extern float2 __attribute__((overloadable)) radians(float2 degrees) {
978 return degrees * (M_PI / 180.f);
979 }
radians(float3 degrees)980 extern float3 __attribute__((overloadable)) radians(float3 degrees) {
981 return degrees * (M_PI / 180.f);
982 }
radians(float4 degrees)983 extern float4 __attribute__((overloadable)) radians(float4 degrees) {
984 return degrees * (M_PI / 180.f);
985 }
986
/** Scalar step: 0 when v is below edge, 1 in every other case. */
extern float __attribute__((overloadable)) step(float edge, float v) {
    if (v < edge) {
        return 0.f;
    }
    return 1.f;
}
step(float2 edge,float2 v)990 extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) {
991 float2 r;
992 r.x = (v.x < edge.x) ? 0.f : 1.f;
993 r.y = (v.y < edge.y) ? 0.f : 1.f;
994 return r;
995 }
step(float3 edge,float3 v)996 extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) {
997 float3 r;
998 r.x = (v.x < edge.x) ? 0.f : 1.f;
999 r.y = (v.y < edge.y) ? 0.f : 1.f;
1000 r.z = (v.z < edge.z) ? 0.f : 1.f;
1001 return r;
1002 }
step(float4 edge,float4 v)1003 extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) {
1004 float4 r;
1005 r.x = (v.x < edge.x) ? 0.f : 1.f;
1006 r.y = (v.y < edge.y) ? 0.f : 1.f;
1007 r.z = (v.z < edge.z) ? 0.f : 1.f;
1008 r.w = (v.w < edge.w) ? 0.f : 1.f;
1009 return r;
1010 }
step(float2 edge,float v)1011 extern float2 __attribute__((overloadable)) step(float2 edge, float v) {
1012 float2 r;
1013 r.x = (v < edge.x) ? 0.f : 1.f;
1014 r.y = (v < edge.y) ? 0.f : 1.f;
1015 return r;
1016 }
step(float3 edge,float v)1017 extern float3 __attribute__((overloadable)) step(float3 edge, float v) {
1018 float3 r;
1019 r.x = (v < edge.x) ? 0.f : 1.f;
1020 r.y = (v < edge.y) ? 0.f : 1.f;
1021 r.z = (v < edge.z) ? 0.f : 1.f;
1022 return r;
1023 }
step(float4 edge,float v)1024 extern float4 __attribute__((overloadable)) step(float4 edge, float v) {
1025 float4 r;
1026 r.x = (v < edge.x) ? 0.f : 1.f;
1027 r.y = (v < edge.y) ? 0.f : 1.f;
1028 r.z = (v < edge.z) ? 0.f : 1.f;
1029 r.w = (v < edge.w) ? 0.f : 1.f;
1030 return r;
1031 }
step(float edge,float2 v)1032 extern float2 __attribute__((overloadable)) step(float edge, float2 v) {
1033 float2 r;
1034 r.x = (v.x < edge) ? 0.f : 1.f;
1035 r.y = (v.y < edge) ? 0.f : 1.f;
1036 return r;
1037 }
step(float edge,float3 v)1038 extern float3 __attribute__((overloadable)) step(float edge, float3 v) {
1039 float3 r;
1040 r.x = (v.x < edge) ? 0.f : 1.f;
1041 r.y = (v.y < edge) ? 0.f : 1.f;
1042 r.z = (v.z < edge) ? 0.f : 1.f;
1043 return r;
1044 }
step(float edge,float4 v)1045 extern float4 __attribute__((overloadable)) step(float edge, float4 v) {
1046 float4 r;
1047 r.x = (v.x < edge) ? 0.f : 1.f;
1048 r.y = (v.y < edge) ? 0.f : 1.f;
1049 r.z = (v.z < edge) ? 0.f : 1.f;
1050 r.w = (v.w < edge) ? 0.f : 1.f;
1051 return r;
1052 }
1053
sign(float v)1054 extern float __attribute__((overloadable)) sign(float v) {
1055 if (v > 0) return 1.f;
1056 if (v < 0) return -1.f;
1057 return v;
1058 }
FN_FUNC_FN(sign)1059 FN_FUNC_FN(sign)
1060
1061
1062 // 6.11.5
1063 extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) {
1064 float3 r;
1065 r.x = lhs.y * rhs.z - lhs.z * rhs.y;
1066 r.y = lhs.z * rhs.x - lhs.x * rhs.z;
1067 r.z = lhs.x * rhs.y - lhs.y * rhs.x;
1068 return r;
1069 }
1070
cross(float4 lhs,float4 rhs)1071 extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) {
1072 float4 r;
1073 r.x = lhs.y * rhs.z - lhs.z * rhs.y;
1074 r.y = lhs.z * rhs.x - lhs.x * rhs.z;
1075 r.z = lhs.x * rhs.y - lhs.y * rhs.x;
1076 r.w = 0.f;
1077 return r;
1078 }
1079
#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
// Portable definitions of dot() and length(). They are compiled whenever the
// SSE implementation is not in use, which also covers the debug runtime
// (libclcore_debug.bc) and builds with debug info.

// dot(): sum of the products of matching components.
extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
    float result = lhs * rhs;
    return result;
}
extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
    float result = lhs.x*rhs.x + lhs.y*rhs.y;
    return result;
}
extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
    float result = lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
    return result;
}
extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
    float result = lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
    return result;
}

// length(): absolute value for a scalar, square root of the sum of squared
// components for a vector.
extern float __attribute__((overloadable)) length(float v) {
    float magnitude = fabs(v);
    return magnitude;
}
extern float __attribute__((overloadable)) length(float2 v) {
    float magnitude = sqrt(v.x*v.x + v.y*v.y);
    return magnitude;
}
extern float __attribute__((overloadable)) length(float3 v) {
    float magnitude = sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
    return magnitude;
}
extern float __attribute__((overloadable)) length(float4 v) {
    float magnitude = sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
    return magnitude;
}

#else

// In this configuration length() is not defined here; it is only declared so
// the functions further down can call it.
extern float __attribute__((overloadable)) length(float v);
extern float __attribute__((overloadable)) length(float2 v);
extern float __attribute__((overloadable)) length(float3 v);
extern float __attribute__((overloadable)) length(float4 v);

#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
1119
distance(float lhs,float rhs)1120 extern float __attribute__((overloadable)) distance(float lhs, float rhs) {
1121 return length(lhs - rhs);
1122 }
distance(float2 lhs,float2 rhs)1123 extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) {
1124 return length(lhs - rhs);
1125 }
distance(float3 lhs,float3 rhs)1126 extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) {
1127 return length(lhs - rhs);
1128 }
distance(float4 lhs,float4 rhs)1129 extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) {
1130 return length(lhs - rhs);
1131 }
1132
/* For the normalization functions, vectors of length 0 should simply be
 * returned (i.e. all the components of that vector are 0).
 *
 * Scalar overload: a one-component vector normalizes to +1 or -1 depending on
 * its sign. An input that is neither positive nor negative (a zero of either
 * sign, or NaN) is returned unchanged, so NaN propagates the same way it does
 * through the vector overloads (v / length(v)) instead of turning into 1.0f.
 */
extern float __attribute__((overloadable)) normalize(float v) {
    if (v > 0.0f) {
        return 1.0f;
    }
    if (v < 0.0f) {
        return -1.0f;
    }
    // Zero or NaN: hand the input back untouched.
    return v;
}
normalize(float2 v)1145 extern float2 __attribute__((overloadable)) normalize(float2 v) {
1146 float l = length(v);
1147 return l == 0.0f ? v : v / l;
1148 }
normalize(float3 v)1149 extern float3 __attribute__((overloadable)) normalize(float3 v) {
1150 float l = length(v);
1151 return l == 0.0f ? v : v / l;
1152 }
normalize(float4 v)1153 extern float4 __attribute__((overloadable)) normalize(float4 v) {
1154 float l = length(v);
1155 return l == 0.0f ? v : v / l;
1156 }
1157
half_sqrt(float v)1158 extern float __attribute__((overloadable)) half_sqrt(float v) {
1159 return sqrt(v);
1160 }
FN_FUNC_FN(half_sqrt)1161 FN_FUNC_FN(half_sqrt)
1162
// fast_length: same formula as length(), but the vector forms take the square
// root through half_sqrt(). The scalar form is simply fabs().
extern float __attribute__((overloadable)) fast_length(float v) {
    float magnitude = fabs(v);
    return magnitude;
}
extern float __attribute__((overloadable)) fast_length(float2 v) {
    float magnitude = half_sqrt(v.x*v.x + v.y*v.y);
    return magnitude;
}
extern float __attribute__((overloadable)) fast_length(float3 v) {
    float magnitude = half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
    return magnitude;
}
extern float __attribute__((overloadable)) fast_length(float4 v) {
    float magnitude = half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
    return magnitude;
}
1175
fast_distance(float lhs,float rhs)1176 extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) {
1177 return fast_length(lhs - rhs);
1178 }
fast_distance(float2 lhs,float2 rhs)1179 extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) {
1180 return fast_length(lhs - rhs);
1181 }
fast_distance(float3 lhs,float3 rhs)1182 extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) {
1183 return fast_length(lhs - rhs);
1184 }
fast_distance(float4 lhs,float4 rhs)1185 extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) {
1186 return fast_length(lhs - rhs);
1187 }
1188
// Declaration only: no definition of the scalar half_rsqrt appears in this
// part of the file. It is needed by the vector fast_normalize overloads below.
extern float __attribute__((overloadable)) half_rsqrt(float);
1190
/* For the normalization functions, vectors of length 0 should simply be
 * returned (i.e. all the components of that vector are 0).
 *
 * Scalar overload: returns +1 for positive input and -1 for negative input.
 * An input that is neither (a zero of either sign, or NaN) is returned
 * unchanged, so NaN is propagated rather than being reported as 1.0f.
 */
extern float __attribute__((overloadable)) fast_normalize(float v) {
    if (v > 0.0f) {
        return 1.0f;
    }
    if (v < 0.0f) {
        return -1.0f;
    }
    // Zero or NaN: hand the input back untouched.
    return v;
}
1203 // If the length is 0, then rlength should be NaN.
fast_normalize(float2 v)1204 extern float2 __attribute__((overloadable)) fast_normalize(float2 v) {
1205 float rlength = half_rsqrt(v.x*v.x + v.y*v.y);
1206 return (rlength == rlength) ? v * rlength : v;
1207 }
fast_normalize(float3 v)1208 extern float3 __attribute__((overloadable)) fast_normalize(float3 v) {
1209 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z);
1210 return (rlength == rlength) ? v * rlength : v;
1211 }
fast_normalize(float4 v)1212 extern float4 __attribute__((overloadable)) fast_normalize(float4 v) {
1213 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
1214 return (rlength == rlength) ? v * rlength : v;
1215 }
1216
// Reciprocal of v, computed as an ordinary float division of 1 by v.
extern float __attribute__((overloadable)) half_recip(float v) {
    float one = 1.f;
    return one / v;
}
1220
1221 /*
1222 extern float __attribute__((overloadable)) approx_atan(float x) {
1223 if (x == 0.f)
1224 return 0.f;
1225 if (x < 0.f)
1226 return -1.f * approx_atan(-1.f * x);
1227 if (x > 1.f)
1228 return M_PI_2 - approx_atan(approx_recip(x));
1229 return x * approx_recip(1.f + 0.28f * x*x);
1230 }
1231 FN_FUNC_FN(approx_atan)
1232 */
1233
/* Overlay of a float and a 32-bit integer, used by the two macros below to
 * move the raw bits of a float in and out of an integer variable. */
typedef union {
    float fv;
    int32_t iv;
} ieee_float_shape_type;

/* GET_FLOAT_WORD(i, d): store the 32 bits of the float d into the integer
 * lvalue i. */
#define GET_FLOAT_WORD(i,d) \
do { \
    ieee_float_shape_type get_pun; \
    get_pun.fv = (d); \
    (i) = get_pun.iv; \
} while (0)

/* SET_FLOAT_WORD(d, i): store into the float lvalue d the value whose 32-bit
 * representation is the integer i. */
#define SET_FLOAT_WORD(d,i) \
do { \
    ieee_float_shape_type set_pun; \
    set_pun.iv = (i); \
    (d) = set_pun.fv; \
} while (0)
1257
1258
1259
// Valid -125 to 125
// Approximate 2^v. v is split into an integer part, which is written directly
// into the exponent field of a float, and a remainder, which goes through a
// short polynomial.
extern float __attribute__((overloadable)) native_exp2(float v) {
    int32_t truncated = (int)v;
    int32_t whole = truncated + (truncated >> 31); // ~floor(v)
    float frac = (v - whole);

    // Float whose bits are the biased exponent (whole + 127) shifted to bit 23.
    float scale;
    SET_FLOAT_WORD(scale, (whole + 127) << 23);

    // 0.694 is roughly ln(2), so the polynomial below is the 4th-order Taylor
    // series of e^(frac * ln 2), i.e. an approximation of 2^frac.
    frac *= 0.694f;
    float frac2 = frac*frac;
    float poly = 1.f + frac + (frac2 * 0.5f) + (frac2*frac * 0.166666f) + (frac2*frac2 * 0.0416666f);
    return scale * poly;
}
1274
native_exp2(float2 v)1275 extern float2 __attribute__((overloadable)) native_exp2(float2 v) {
1276 int2 iv = convert_int2(v);
1277 int2 x = iv + (iv >> (int2)31);//floor(v);
1278 float2 r = (v - convert_float2(x));
1279
1280 x += 127;
1281
1282 float2 fo = (float2)(x << (int2)23);
1283
1284 r *= 0.694f; // ~ log(e) / log(2)
1285 float2 r2 = r*r;
1286 float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
1287 return fo * adj;
1288 }
1289
native_exp2(float4 v)1290 extern float4 __attribute__((overloadable)) native_exp2(float4 v) {
1291 int4 iv = convert_int4(v);
1292 int4 x = iv + (iv >> (int4)31);//floor(v);
1293 float4 r = (v - convert_float4(x));
1294
1295 x += 127;
1296
1297 float4 fo = (float4)(x << (int4)23);
1298
1299 r *= 0.694f; // ~ log(e) / log(2)
1300 float4 r2 = r*r;
1301 float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
1302 return fo * adj;
1303 }
1304
native_exp2(float3 v)1305 extern float3 __attribute__((overloadable)) native_exp2(float3 v) {
1306 float4 t = 1.f;
1307 t.xyz = v;
1308 return native_exp2(t).xyz;
1309 }
1310
1311
native_exp(float v)1312 extern float __attribute__((overloadable)) native_exp(float v) {
1313 return native_exp2(v * 1.442695041f);
1314 }
native_exp(float2 v)1315 extern float2 __attribute__((overloadable)) native_exp(float2 v) {
1316 return native_exp2(v * 1.442695041f);
1317 }
native_exp(float3 v)1318 extern float3 __attribute__((overloadable)) native_exp(float3 v) {
1319 return native_exp2(v * 1.442695041f);
1320 }
native_exp(float4 v)1321 extern float4 __attribute__((overloadable)) native_exp(float4 v) {
1322 return native_exp2(v * 1.442695041f);
1323 }
1324
native_exp10(float v)1325 extern float __attribute__((overloadable)) native_exp10(float v) {
1326 return native_exp2(v * 3.321928095f);
1327 }
native_exp10(float2 v)1328 extern float2 __attribute__((overloadable)) native_exp10(float2 v) {
1329 return native_exp2(v * 3.321928095f);
1330 }
native_exp10(float3 v)1331 extern float3 __attribute__((overloadable)) native_exp10(float3 v) {
1332 return native_exp2(v * 3.321928095f);
1333 }
native_exp10(float4 v)1334 extern float4 __attribute__((overloadable)) native_exp10(float4 v) {
1335 return native_exp2(v * 3.321928095f);
1336 }
1337
// Approximate log2(v) from the float's bit layout: the exponent field gives
// the integer part, and the remaining bits (re-biased so the value lies in
// [1, 2)) go through a polynomial centred on 1.5.
extern float __attribute__((overloadable)) native_log2(float v) {
    int32_t bits;
    GET_FLOAT_WORD(bits, v);

    // Biased exponent: the 8 bits starting at bit 23.
    int32_t biasedExp = (bits >> 23) & 0xff;

    // Keep the low 23 bits and force the exponent field to 127.
    bits &= 0x7fffff;
    bits |= 127 << 23;

    float m;
    SET_FLOAT_WORD(m, bits);
    m -= 1.5f;
    float m2 = m*m;
    // Every coefficient is divided by 0.693147181 (about ln 2).
    // NOTE(review): the numerators look like the Taylor series of ln(x)
    // around x = 1.5 - confirm against the original derivation.
    float correction = (0.405465108f / 0.693147181f) +
                       ((0.666666667f / 0.693147181f) * m) -
                       ((0.222222222f / 0.693147181f) * m2) +
                       ((0.098765432f / 0.693147181f) * m*m2) -
                       ((0.049382716f / 0.693147181f) * m2*m2) +
                       ((0.026337449f / 0.693147181f) * m*m2*m2) -
                       ((0.014631916f / 0.693147181f) * m2*m2*m2);
    return (float)(biasedExp - 127) + correction;
}
native_log2(float2 v)1360 extern float2 __attribute__((overloadable)) native_log2(float2 v) {
1361 float2 v2 = {native_log2(v.x), native_log2(v.y)};
1362 return v2;
1363 }
native_log2(float3 v)1364 extern float3 __attribute__((overloadable)) native_log2(float3 v) {
1365 float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)};
1366 return v2;
1367 }
native_log2(float4 v)1368 extern float4 __attribute__((overloadable)) native_log2(float4 v) {
1369 float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)};
1370 return v2;
1371 }
1372
native_log(float v)1373 extern float __attribute__((overloadable)) native_log(float v) {
1374 return native_log2(v) * (1.f / 1.442695041f);
1375 }
native_log(float2 v)1376 extern float2 __attribute__((overloadable)) native_log(float2 v) {
1377 return native_log2(v) * (1.f / 1.442695041f);
1378 }
native_log(float3 v)1379 extern float3 __attribute__((overloadable)) native_log(float3 v) {
1380 return native_log2(v) * (1.f / 1.442695041f);
1381 }
native_log(float4 v)1382 extern float4 __attribute__((overloadable)) native_log(float4 v) {
1383 return native_log2(v) * (1.f / 1.442695041f);
1384 }
1385
native_log10(float v)1386 extern float __attribute__((overloadable)) native_log10(float v) {
1387 return native_log2(v) * (1.f / 3.321928095f);
1388 }
native_log10(float2 v)1389 extern float2 __attribute__((overloadable)) native_log10(float2 v) {
1390 return native_log2(v) * (1.f / 3.321928095f);
1391 }
native_log10(float3 v)1392 extern float3 __attribute__((overloadable)) native_log10(float3 v) {
1393 return native_log2(v) * (1.f / 3.321928095f);
1394 }
native_log10(float4 v)1395 extern float4 __attribute__((overloadable)) native_log10(float4 v) {
1396 return native_log2(v) * (1.f / 3.321928095f);
1397 }
1398
1399
native_powr(float v,float y)1400 extern float __attribute__((overloadable)) native_powr(float v, float y) {
1401 float v2 = native_log2(v);
1402 v2 = fmax(v2 * y, -125.f);
1403 return native_exp2(v2);
1404 }
native_powr(float2 v,float2 y)1405 extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) {
1406 float2 v2 = native_log2(v);
1407 v2 = fmax(v2 * y, -125.f);
1408 return native_exp2(v2);
1409 }
native_powr(float3 v,float3 y)1410 extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) {
1411 float3 v2 = native_log2(v);
1412 v2 = fmax(v2 * y, -125.f);
1413 return native_exp2(v2);
1414 }
native_powr(float4 v,float4 y)1415 extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) {
1416 float4 v2 = native_log2(v);
1417 v2 = fmax(v2 * y, -125.f);
1418 return native_exp2(v2);
1419 }
1420
min(double v1,double v2)1421 extern double __attribute__((overloadable)) min(double v1, double v2) {
1422 return v1 < v2 ? v1 : v2;
1423 }
1424
min(double2 v1,double2 v2)1425 extern double2 __attribute__((overloadable)) min(double2 v1, double2 v2) {
1426 double2 r;
1427 r.x = v1.x < v2.x ? v1.x : v2.x;
1428 r.y = v1.y < v2.y ? v1.y : v2.y;
1429 return r;
1430 }
1431
min(double3 v1,double3 v2)1432 extern double3 __attribute__((overloadable)) min(double3 v1, double3 v2) {
1433 double3 r;
1434 r.x = v1.x < v2.x ? v1.x : v2.x;
1435 r.y = v1.y < v2.y ? v1.y : v2.y;
1436 r.z = v1.z < v2.z ? v1.z : v2.z;
1437 return r;
1438 }
1439
min(double4 v1,double4 v2)1440 extern double4 __attribute__((overloadable)) min(double4 v1, double4 v2) {
1441 double4 r;
1442 r.x = v1.x < v2.x ? v1.x : v2.x;
1443 r.y = v1.y < v2.y ? v1.y : v2.y;
1444 r.z = v1.z < v2.z ? v1.z : v2.z;
1445 r.w = v1.w < v2.w ? v1.w : v2.w;
1446 return r;
1447 }
1448
min(long v1,long v2)1449 extern long __attribute__((overloadable)) min(long v1, long v2) {
1450 return v1 < v2 ? v1 : v2;
1451 }
min(long2 v1,long2 v2)1452 extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
1453 long2 r;
1454 r.x = v1.x < v2.x ? v1.x : v2.x;
1455 r.y = v1.y < v2.y ? v1.y : v2.y;
1456 return r;
1457 }
min(long3 v1,long3 v2)1458 extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
1459 long3 r;
1460 r.x = v1.x < v2.x ? v1.x : v2.x;
1461 r.y = v1.y < v2.y ? v1.y : v2.y;
1462 r.z = v1.z < v2.z ? v1.z : v2.z;
1463 return r;
1464 }
min(long4 v1,long4 v2)1465 extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
1466 long4 r;
1467 r.x = v1.x < v2.x ? v1.x : v2.x;
1468 r.y = v1.y < v2.y ? v1.y : v2.y;
1469 r.z = v1.z < v2.z ? v1.z : v2.z;
1470 r.w = v1.w < v2.w ? v1.w : v2.w;
1471 return r;
1472 }
1473
min(ulong v1,ulong v2)1474 extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
1475 return v1 < v2 ? v1 : v2;
1476 }
min(ulong2 v1,ulong2 v2)1477 extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
1478 ulong2 r;
1479 r.x = v1.x < v2.x ? v1.x : v2.x;
1480 r.y = v1.y < v2.y ? v1.y : v2.y;
1481 return r;
1482 }
min(ulong3 v1,ulong3 v2)1483 extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
1484 ulong3 r;
1485 r.x = v1.x < v2.x ? v1.x : v2.x;
1486 r.y = v1.y < v2.y ? v1.y : v2.y;
1487 r.z = v1.z < v2.z ? v1.z : v2.z;
1488 return r;
1489 }
min(ulong4 v1,ulong4 v2)1490 extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
1491 ulong4 r;
1492 r.x = v1.x < v2.x ? v1.x : v2.x;
1493 r.y = v1.y < v2.y ? v1.y : v2.y;
1494 r.z = v1.z < v2.z ? v1.z : v2.z;
1495 r.w = v1.w < v2.w ? v1.w : v2.w;
1496 return r;
1497 }
1498
max(double v1,double v2)1499 extern double __attribute__((overloadable)) max(double v1, double v2) {
1500 return v1 > v2 ? v1 : v2;
1501 }
1502
max(double2 v1,double2 v2)1503 extern double2 __attribute__((overloadable)) max(double2 v1, double2 v2) {
1504 double2 r;
1505 r.x = v1.x > v2.x ? v1.x : v2.x;
1506 r.y = v1.y > v2.y ? v1.y : v2.y;
1507 return r;
1508 }
1509
max(double3 v1,double3 v2)1510 extern double3 __attribute__((overloadable)) max(double3 v1, double3 v2) {
1511 double3 r;
1512 r.x = v1.x > v2.x ? v1.x : v2.x;
1513 r.y = v1.y > v2.y ? v1.y : v2.y;
1514 r.z = v1.z > v2.z ? v1.z : v2.z;
1515 return r;
1516 }
1517
max(double4 v1,double4 v2)1518 extern double4 __attribute__((overloadable)) max(double4 v1, double4 v2) {
1519 double4 r;
1520 r.x = v1.x > v2.x ? v1.x : v2.x;
1521 r.y = v1.y > v2.y ? v1.y : v2.y;
1522 r.z = v1.z > v2.z ? v1.z : v2.z;
1523 r.w = v1.w > v2.w ? v1.w : v2.w;
1524 return r;
1525 }
1526
max(long v1,long v2)1527 extern long __attribute__((overloadable)) max(long v1, long v2) {
1528 return v1 > v2 ? v1 : v2;
1529 }
max(long2 v1,long2 v2)1530 extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
1531 long2 r;
1532 r.x = v1.x > v2.x ? v1.x : v2.x;
1533 r.y = v1.y > v2.y ? v1.y : v2.y;
1534 return r;
1535 }
max(long3 v1,long3 v2)1536 extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
1537 long3 r;
1538 r.x = v1.x > v2.x ? v1.x : v2.x;
1539 r.y = v1.y > v2.y ? v1.y : v2.y;
1540 r.z = v1.z > v2.z ? v1.z : v2.z;
1541 return r;
1542 }
max(long4 v1,long4 v2)1543 extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
1544 long4 r;
1545 r.x = v1.x > v2.x ? v1.x : v2.x;
1546 r.y = v1.y > v2.y ? v1.y : v2.y;
1547 r.z = v1.z > v2.z ? v1.z : v2.z;
1548 r.w = v1.w > v2.w ? v1.w : v2.w;
1549 return r;
1550 }
1551
max(ulong v1,ulong v2)1552 extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
1553 return v1 > v2 ? v1 : v2;
1554 }
max(ulong2 v1,ulong2 v2)1555 extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
1556 ulong2 r;
1557 r.x = v1.x > v2.x ? v1.x : v2.x;
1558 r.y = v1.y > v2.y ? v1.y : v2.y;
1559 return r;
1560 }
max(ulong3 v1,ulong3 v2)1561 extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
1562 ulong3 r;
1563 r.x = v1.x > v2.x ? v1.x : v2.x;
1564 r.y = v1.y > v2.y ? v1.y : v2.y;
1565 r.z = v1.z > v2.z ? v1.z : v2.z;
1566 return r;
1567 }
max(ulong4 v1,ulong4 v2)1568 extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
1569 ulong4 r;
1570 r.x = v1.x > v2.x ? v1.x : v2.x;
1571 r.y = v1.y > v2.y ? v1.y : v2.y;
1572 r.z = v1.z > v2.z ? v1.z : v2.z;
1573 r.w = v1.w > v2.w ? v1.w : v2.w;
1574 return r;
1575 }
1576
/* Thunk generators: each macro defines native_<fn> for float, float2, float3
 * and float4 as a direct call to <fn> with the same arguments.
 *
 * THUNK_NATIVE_F: one float-typed argument.
 */
#define THUNK_NATIVE_F(fn) \
    float __attribute__((overloadable)) native_##fn(float x) { \
        return fn(x); \
    } \
    float2 __attribute__((overloadable)) native_##fn(float2 x) { \
        return fn(x); \
    } \
    float3 __attribute__((overloadable)) native_##fn(float3 x) { \
        return fn(x); \
    } \
    float4 __attribute__((overloadable)) native_##fn(float4 x) { \
        return fn(x); \
    }

/* THUNK_NATIVE_F_F: two float-typed arguments of the same width. */
#define THUNK_NATIVE_F_F(fn) \
    float __attribute__((overloadable)) native_##fn(float a, float b) { \
        return fn(a, b); \
    } \
    float2 __attribute__((overloadable)) native_##fn(float2 a, float2 b) { \
        return fn(a, b); \
    } \
    float3 __attribute__((overloadable)) native_##fn(float3 a, float3 b) { \
        return fn(a, b); \
    } \
    float4 __attribute__((overloadable)) native_##fn(float4 a, float4 b) { \
        return fn(a, b); \
    }

/* THUNK_NATIVE_F_FP: a float-typed argument plus a pointer to the same type,
 * passed through unchanged. */
#define THUNK_NATIVE_F_FP(fn) \
    float __attribute__((overloadable)) native_##fn(float a, float *out) { \
        return fn(a, out); \
    } \
    float2 __attribute__((overloadable)) native_##fn(float2 a, float2 *out) { \
        return fn(a, out); \
    } \
    float3 __attribute__((overloadable)) native_##fn(float3 a, float3 *out) { \
        return fn(a, out); \
    } \
    float4 __attribute__((overloadable)) native_##fn(float4 a, float4 *out) { \
        return fn(a, out); \
    }

/* THUNK_NATIVE_F_I: a float-typed argument plus an int-typed argument of the
 * same width. */
#define THUNK_NATIVE_F_I(fn) \
    float __attribute__((overloadable)) native_##fn(float a, int n) { \
        return fn(a, n); \
    } \
    float2 __attribute__((overloadable)) native_##fn(float2 a, int2 n) { \
        return fn(a, n); \
    } \
    float3 __attribute__((overloadable)) native_##fn(float3 a, int3 n) { \
        return fn(a, n); \
    } \
    float4 __attribute__((overloadable)) native_##fn(float4 a, int4 n) { \
        return fn(a, n); \
    }
1600
// Instantiate the native_* thunks. Each line defines native_<name> for float,
// float2, float3 and float4 as a plain call to <name>; the macro suffix picks
// the argument shape (_F: one float arg, _F_F: two float args, _F_I: float and
// int, _F_FP: float and pointer-to-float).
THUNK_NATIVE_F(acos)
THUNK_NATIVE_F(acosh)
THUNK_NATIVE_F(acospi)
THUNK_NATIVE_F(asin)
THUNK_NATIVE_F(asinh)
THUNK_NATIVE_F(asinpi)
THUNK_NATIVE_F(atan)
THUNK_NATIVE_F_F(atan2)
THUNK_NATIVE_F(atanh)
THUNK_NATIVE_F(atanpi)
THUNK_NATIVE_F_F(atan2pi)
THUNK_NATIVE_F(cbrt)
THUNK_NATIVE_F(cos)
THUNK_NATIVE_F(cosh)
THUNK_NATIVE_F(cospi)
THUNK_NATIVE_F(expm1)
THUNK_NATIVE_F_F(hypot)
THUNK_NATIVE_F(log1p)
THUNK_NATIVE_F_I(rootn)
THUNK_NATIVE_F(rsqrt)
THUNK_NATIVE_F(sqrt)
THUNK_NATIVE_F(sin)
THUNK_NATIVE_F_FP(sincos)
THUNK_NATIVE_F(sinh)
THUNK_NATIVE_F(sinpi)
THUNK_NATIVE_F(tan)
THUNK_NATIVE_F(tanh)
THUNK_NATIVE_F(tanpi)

// The thunk generators are not needed past this point.
#undef THUNK_NATIVE_F
#undef THUNK_NATIVE_F_F
#undef THUNK_NATIVE_F_I
#undef THUNK_NATIVE_F_FP
1634
1635 float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);}
native_normalize(float2 v)1636 float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);}
native_normalize(float3 v)1637 float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);}
native_normalize(float4 v)1638 float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);}
1639
native_distance(float v1,float v2)1640 float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);}
native_distance(float2 v1,float2 v2)1641 float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);}
native_distance(float3 v1,float3 v2)1642 float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);}
native_distance(float4 v1,float4 v2)1643 float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);}
1644
native_length(float v)1645 float __attribute__((overloadable)) native_length(float v) { return fast_length(v);}
native_length(float2 v)1646 float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);}
native_length(float3 v)1647 float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);}
native_length(float4 v)1648 float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);}
1649
native_divide(float v1,float v2)1650 float __attribute__((overloadable)) native_divide(float v1, float v2) { return v1 / v2;}
native_divide(float2 v1,float2 v2)1651 float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;}
native_divide(float3 v1,float3 v2)1652 float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;}
native_divide(float4 v1,float4 v2)1653 float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;}
1654
native_recip(float v)1655 float __attribute__((overloadable)) native_recip(float v) { return 1.f / v;}
native_recip(float2 v)1656 float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;}
native_recip(float3 v)1657 float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;}
native_recip(float4 v)1658 float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;}
1659
1660
1661
1662
1663
// Retire the overload-generating helper macros used for the float and integer
// functions above (FN_FUNC_FN and IN_FUNC_FN are defined near the top of the
// file), so they cannot leak into the half-precision section that follows.
#undef FN_FUNC_FN
#undef IN_FUNC_FN
#undef FN_FUNC_FN_FN
#undef FN_FUNC_FN_F
#undef FN_FUNC_FN_IN
#undef FN_FUNC_FN_I
#undef FN_FUNC_FN_PFN
#undef FN_FUNC_FN_PIN
#undef FN_FUNC_FN_FN_FN
#undef FN_FUNC_FN_FN_PIN
#undef XN_FUNC_YN
#undef UIN_FUNC_IN
#undef IN_FUNC_IN
#undef XN_FUNC_XN_XN_BODY
#undef IN_FUNC_IN_IN_BODY
1679
// Raw 16-bit pattern 0x7c00: in the IEEE 754 binary16 layout this is sign 0,
// all five exponent bits set and a zero mantissa, i.e. +infinity. It is not
// referenced in the part of the file shown here.
static const unsigned short kHalfPositiveInfinity = 0x7c00;
1681
/* Generate the half-precision overloads
 *     HN output = fn(HN input)
 * for HN = half, half2, half3 and half4. Each overload widens its argument to
 * float, calls the float overload of fn, and narrows the result back to half.
 */
#define HN_FUNC_HN(fn) \
extern half __attribute__((overloadable)) fn(half h) { \
    float wide = (float) h; \
    return (half) fn(wide); \
} \
extern half2 __attribute__((overloadable)) fn(half2 v) { \
    float2 wide = convert_float2(v); \
    return convert_half2(fn(wide)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v) { \
    float3 wide = convert_float3(v); \
    return convert_half3(fn(wide)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v) { \
    float4 wide = convert_float4(v); \
    return convert_half4(fn(wide)); \
}
1699
/* Generate the half-precision overloads
 *     HN output = fn(HN input1, HN input2)
 * for HN = half, half2, half3 and half4. Both arguments are widened to float,
 * the float overload of fn is called, and the result is narrowed back to half.
 */
#define HN_FUNC_HN_HN(fn) \
extern half __attribute__((overloadable)) fn(half h1, half h2) { \
    float a = (float) h1; \
    float b = (float) h2; \
    return (half) fn(a, b); \
} \
extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \
    float2 a = convert_float2(v1); \
    float2 b = convert_float2(v2); \
    return convert_half2(fn(a, b)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \
    float3 a = convert_float3(v1); \
    float3 b = convert_float3(v2); \
    return convert_half3(fn(a, b)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \
    float4 a = convert_float4(v1); \
    float4 b = convert_float4(v2); \
    return convert_half4(fn(a, b)); \
}
1720
/* Generate the half-precision overloads
 *     HN output = fn(HN input1, half input2)
 * for the vector types HN = half2, half3 and half4 only (no scalar overload
 * is produced). The vector argument is widened with convert_floatN, the
 * scalar one with a float cast, and the result is narrowed back to half.
 */
#define HN_FUNC_HN_H(fn) \
extern half2 __attribute__((overloadable)) fn(half2 v1, half v2) { \
    float2 a = convert_float2(v1); \
    float b = (float) v2; \
    return convert_half2(fn(a, b)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, half v2) { \
    float3 a = convert_float3(v1); \
    float b = (float) v2; \
    return convert_half3(fn(a, b)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, half v2) { \
    float4 a = convert_float4(v1); \
    float b = (float) v2; \
    return convert_half4(fn(a, b)); \
}
1735
/* Generate the half-precision overloads
 *     HN output = fn(HN input1, HN input2, HN input3)
 * for HN = half, half2, half3 and half4. All three arguments are widened to
 * float, the float overload of fn is called, and the result is narrowed back
 * to half.
 */
#define HN_FUNC_HN_HN_HN(fn) \
extern half __attribute__((overloadable)) fn(half h1, half h2, half h3) { \
    float a = (float) h1; \
    float b = (float) h2; \
    float c = (float) h3; \
    return (half) fn(a, b, c); \
} \
extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2, half2 v3) { \
    float2 a = convert_float2(v1); \
    float2 b = convert_float2(v2); \
    float2 c = convert_float2(v3); \
    return convert_half2(fn(a, b, c)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2, half3 v3) { \
    float3 a = convert_float3(v1); \
    float3 b = convert_float3(v2); \
    float3 c = convert_float3(v3); \
    return convert_half3(fn(a, b, c)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2, half4 v3) { \
    float4 a = convert_float4(v1); \
    float4 b = convert_float4(v2); \
    float4 c = convert_float4(v3); \
    return convert_half4(fn(a, b, c)); \
}
1759
/* Generate the half-precision overloads
 *     HN output = fn(HN input1, IN input2)
 * for HN = half, half2, half3 and half4, with IN the int type of the same
 * vector length. Only the half argument is widened to float; the integer
 * argument is passed through untouched.
 */
#define HN_FUNC_HN_IN(fn) \
extern half __attribute__((overloadable)) fn(half h1, int v) { \
    float wide = (float) h1; \
    return (half) fn(wide, v); \
} \
extern half2 __attribute__((overloadable)) fn(half2 v1, int2 v2) { \
    float2 wide = convert_float2(v1); \
    return convert_half2(fn(wide, v2)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, int3 v2) { \
    float3 wide = convert_float3(v1); \
    return convert_half3(fn(wide, v2)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, int4 v2) { \
    float4 wide = convert_float4(v1); \
    return convert_half4(fn(wide, v2)); \
}
1778
/* Generate the half-precision overloads
 *     half output = fn(HN input1)
 * for HN = half, half2, half3 and half4. The argument is widened to float,
 * the float overload of fn is called, and its scalar result is converted to
 * half on return.
 */
#define H_FUNC_HN(fn) \
extern half __attribute__((overloadable)) fn(half h) { \
    float wide = (float) h; \
    return (half) fn(wide); \
} \
extern half __attribute__((overloadable)) fn(half2 v) { \
    float2 wide = convert_float2(v); \
    return fn(wide); \
} \
extern half __attribute__((overloadable)) fn(half3 v) { \
    float3 wide = convert_float3(v); \
    return fn(wide); \
} \
extern half __attribute__((overloadable)) fn(half4 v) { \
    float4 wide = convert_float4(v); \
    return fn(wide); \
}
1796
/* Generate the half-precision overloads
 *     half output = fn(HN input1, HN input2)
 * for HN = half, half2, half3 and half4. Both arguments are widened to float,
 * the float overload of fn is called, and its scalar result is converted to
 * half on return.
 */
#define H_FUNC_HN_HN(fn) \
extern half __attribute__((overloadable)) fn(half h1, half h2) { \
    float a = (float) h1; \
    float b = (float) h2; \
    return (half) fn(a, b); \
} \
extern half __attribute__((overloadable)) fn(half2 v1, half2 v2) { \
    float2 a = convert_float2(v1); \
    float2 b = convert_float2(v2); \
    return fn(a, b); \
} \
extern half __attribute__((overloadable)) fn(half3 v1, half3 v2) { \
    float3 a = convert_float3(v1); \
    float3 b = convert_float3(v2); \
    return fn(a, b); \
} \
extern half __attribute__((overloadable)) fn(half4 v1, half4 v2) { \
    float4 a = convert_float4(v1); \
    float4 b = convert_float4(v2); \
    return fn(a, b); \
}
1814
/* Generate the vector half overloads
 *     HN output = fnc(HN input1, HN *output2)
 * for HN = half2, half3 and half4 by calling the scalar fnc(half, half *)
 * once per component. The pointer results are collected in scalar temporaries
 * first and copied into *v2 only after every component has been processed.
 */
#define SCALARIZE_HN_FUNC_HN_PHN(fnc) \
extern half2 __attribute__((overloadable)) fnc(half2 v1, half2 *v2) { \
    half outX, outY; \
    half2 ret; \
    ret.x = fnc(v1.x, &outX); \
    ret.y = fnc(v1.y, &outY); \
    v2->x = outX; \
    v2->y = outY; \
    return ret; \
} \
extern half3 __attribute__((overloadable)) fnc(half3 v1, half3 *v2) { \
    half outX, outY, outZ; \
    half3 ret; \
    ret.x = fnc(v1.x, &outX); \
    ret.y = fnc(v1.y, &outY); \
    ret.z = fnc(v1.z, &outZ); \
    v2->x = outX; \
    v2->y = outY; \
    v2->z = outZ; \
    return ret; \
} \
extern half4 __attribute__((overloadable)) fnc(half4 v1, half4 *v2) { \
    half outX, outY, outZ, outW; \
    half4 ret; \
    ret.x = fnc(v1.x, &outX); \
    ret.y = fnc(v1.y, &outY); \
    ret.z = fnc(v1.z, &outZ); \
    ret.w = fnc(v1.w, &outW); \
    v2->x = outX; \
    v2->y = outY; \
    v2->z = outZ; \
    v2->w = outW; \
    return ret; \
}
1849
/* Define f16 functions of the form
 *     HN output = fn(HN input1, HN input2)
 * where HN is a vector half type.  The functions are defined to call the
 * scalar function of the same name, once per component, so a scalar
 * fn(half, half) must already be declared where this macro is used.
 *
 * The last line of the macro deliberately has no trailing backslash: a line
 * continuation there would make the macro absorb whatever line follows it.
 */
#define SCALARIZE_HN_FUNC_HN_HN(fn) \
extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \
    half2 ret; \
    ret.x = fn(v1.x, v2.x); \
    ret.y = fn(v1.y, v2.y); \
    return ret; \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \
    half3 ret; \
    ret.x = fn(v1.x, v2.x); \
    ret.y = fn(v1.y, v2.y); \
    ret.z = fn(v1.z, v2.z); \
    return ret; \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \
    half4 ret; \
    ret.x = fn(v1.x, v2.x); \
    ret.y = fn(v1.y, v2.y); \
    ret.z = fn(v1.z, v2.z); \
    ret.w = fn(v1.w, v2.w); \
    return ret; \
}
1877
// Half-precision overloads for the inverse-trigonometric family, generated by
// macros defined earlier in this file (outside this excerpt).
// NOTE(review): HN_FUNC_HN / HN_FUNC_HN_HN presumably emit scalar and vector
// half overloads that forward to the float implementations -- confirm against
// the macro definitions.
1878 HN_FUNC_HN(acos);
1879 HN_FUNC_HN(acosh);
1880 HN_FUNC_HN(acospi);
1881 HN_FUNC_HN(asin);
1882 HN_FUNC_HN(asinh);
1883 HN_FUNC_HN(asinpi);
1884 HN_FUNC_HN(atan);
1885 HN_FUNC_HN(atanh);
1886 HN_FUNC_HN(atanpi);
1887 HN_FUNC_HN_HN(atan2);
1888 HN_FUNC_HN_HN(atan2pi);
1889
1890 HN_FUNC_HN(cbrt);
1891 HN_FUNC_HN(ceil);
1892
// Scalar half copysign is only declared here (no body in this part of the
// file); the half2/half3/half4 overloads are built from it one lane at a time.
1893 extern half __attribute__((overloadable)) copysign(half x, half y);
1894 SCALARIZE_HN_FUNC_HN_HN(copysign);
1895
// Cosine family.
1896 HN_FUNC_HN(cos);
1897 HN_FUNC_HN(cosh);
1898 HN_FUNC_HN(cospi);
1899
cross(half3 lhs,half3 rhs)1900 extern half3 __attribute__((overloadable)) cross(half3 lhs, half3 rhs) {
1901 half3 r;
1902 r.x = lhs.y * rhs.z - lhs.z * rhs.y;
1903 r.y = lhs.z * rhs.x - lhs.x * rhs.z;
1904 r.z = lhs.x * rhs.y - lhs.y * rhs.x;
1905 return r;
1906 }
1907
cross(half4 lhs,half4 rhs)1908 extern half4 __attribute__((overloadable)) cross(half4 lhs, half4 rhs) {
1909 half4 r;
1910 r.x = lhs.y * rhs.z - lhs.z * rhs.y;
1911 r.y = lhs.z * rhs.x - lhs.x * rhs.z;
1912 r.z = lhs.x * rhs.y - lhs.y * rhs.x;
1913 r.w = 0.f;
1914 return r;
1915 }
1916
// Half overloads for degrees, the vector-to-scalar reductions (distance, dot),
// the error/exponential family and the f* helpers.
// H_FUNC_HN_HN widens both operands to float and returns a scalar half.
// NOTE(review): the HN_FUNC_* generators are defined outside this excerpt;
// judging by their names, _HN_HN takes two same-width operands, _HN_H a vector
// plus a scalar, and _HN_HN_HN three operands -- confirm against the macros.
1917 HN_FUNC_HN(degrees);
1918 H_FUNC_HN_HN(distance);
1919 H_FUNC_HN_HN(dot);
1920
1921 HN_FUNC_HN(erf);
1922 HN_FUNC_HN(erfc);
1923 HN_FUNC_HN(exp);
1924 HN_FUNC_HN(exp10);
1925 HN_FUNC_HN(exp2);
1926 HN_FUNC_HN(expm1);
1927
1928 HN_FUNC_HN(fabs);
1929 HN_FUNC_HN_HN(fdim);
1930 HN_FUNC_HN(floor);
1931 HN_FUNC_HN_HN_HN(fma);
// fmax/fmin are instantiated twice: once per operand-shape generator.
1932 HN_FUNC_HN_HN(fmax);
1933 HN_FUNC_HN_H(fmax);
1934 HN_FUNC_HN_HN(fmin);
1935 HN_FUNC_HN_H(fmin);
1936 HN_FUNC_HN_HN(fmod);
1937
fract(half v,half * iptr)1938 extern half __attribute__((overloadable)) fract(half v, half *iptr) {
1939 // maxLessThanOne = 0.99951171875, the largest value < 1.0
1940 half maxLessThanOne;
1941 SET_HALF_WORD(maxLessThanOne, 0x3bff);
1942
1943 int i = (int) floor(v);
1944 if (iptr) {
1945 *iptr = i;
1946 }
1947 // return v - floor(v), if strictly less than one
1948 return fmin(v - i, maxLessThanOne);
1949 }
1950
// half2/half3/half4 fract(v, &floor): built lane by lane from the scalar
// fract(half, half *).  Unlike the scalar overload, the generated vector
// overloads write through the output pointer without a null check.
1951 SCALARIZE_HN_FUNC_HN_PHN(fract);
1952
fract(half v)1953 extern half __attribute__((const, overloadable)) fract(half v) {
1954 half unused;
1955 return fract(v, &unused);
1956 }
1957
fract(half2 v)1958 extern half2 __attribute__((const, overloadable)) fract(half2 v) {
1959 half2 unused;
1960 return fract(v, &unused);
1961 }
1962
fract(half3 v)1963 extern half3 __attribute__((const, overloadable)) fract(half3 v) {
1964 half3 unused;
1965 return fract(v, &unused);
1966 }
1967
fract(half4 v)1968 extern half4 __attribute__((const, overloadable)) fract(half4 v) {
1969 half4 unused;
1970 return fract(v, &unused);
1971 }
1972
// Scalar half frexp: declaration only -- no body appears in this part of the
// file, so the definition is presumably supplied elsewhere.  The vector
// overloads that follow call it once per lane.
1973 extern half __attribute__((overloadable)) frexp(half x, int *eptr);
1974
frexp(half2 v1,int2 * eptr)1975 extern half2 __attribute__((overloadable)) frexp(half2 v1, int2 *eptr) {
1976 half2 ret;
1977 int e[2];
1978 ret.x = frexp(v1.x, &e[0]);
1979 ret.y = frexp(v1.y, &e[1]);
1980 eptr->x = e[0];
1981 eptr->y = e[1];
1982 return ret;
1983 }
1984
frexp(half3 v1,int3 * eptr)1985 extern half3 __attribute__((overloadable)) frexp(half3 v1, int3 *eptr) {
1986 half3 ret;
1987 int e[3];
1988 ret.x = frexp(v1.x, &e[0]);
1989 ret.y = frexp(v1.y, &e[1]);
1990 ret.z = frexp(v1.z, &e[2]);
1991 eptr->x = e[0];
1992 eptr->y = e[1];
1993 eptr->z = e[2];
1994 return ret;
1995 }
1996
frexp(half4 v1,int4 * eptr)1997 extern half4 __attribute__((overloadable)) frexp(half4 v1, int4 *eptr) {
1998 half4 ret;
1999 int e[4];
2000 ret.x = frexp(v1.x, &e[0]);
2001 ret.y = frexp(v1.y, &e[1]);
2002 ret.z = frexp(v1.z, &e[2]);
2003 ret.w = frexp(v1.w, &e[3]);
2004 eptr->x = e[0];
2005 eptr->y = e[1];
2006 eptr->z = e[2];
2007 eptr->w = e[3];
2008 return ret;
2009 }
2010
// Half overloads of hypot, generated by a macro defined outside this excerpt.
2011 HN_FUNC_HN_HN(hypot);
2012
// Scalar half ilogb: declaration only; the vector overloads below call it
// once per lane.
2013 extern int __attribute__((overloadable)) ilogb(half x);
2014
ilogb(half2 v)2015 extern int2 __attribute__((overloadable)) ilogb(half2 v) {
2016 int2 ret;
2017 ret.x = ilogb(v.x);
2018 ret.y = ilogb(v.y);
2019 return ret;
2020 }
ilogb(half3 v)2021 extern int3 __attribute__((overloadable)) ilogb(half3 v) {
2022 int3 ret;
2023 ret.x = ilogb(v.x);
2024 ret.y = ilogb(v.y);
2025 ret.z = ilogb(v.z);
2026 return ret;
2027 }
ilogb(half4 v)2028 extern int4 __attribute__((overloadable)) ilogb(half4 v) {
2029 int4 ret;
2030 ret.x = ilogb(v.x);
2031 ret.y = ilogb(v.y);
2032 ret.z = ilogb(v.z);
2033 ret.w = ilogb(v.w);
2034 return ret;
2035 }
2036
// ldexp overloads generated by HN_FUNC_HN_IN (defined outside this excerpt).
// NOTE(review): presumably these pair each half width with an int operand of
// the same width; the vector-with-scalar-int overloads are written by hand
// right after this line -- confirm against the macro definition.
2037 HN_FUNC_HN_IN(ldexp);
ldexp(half2 v,int exponent)2038 extern half2 __attribute__((overloadable)) ldexp(half2 v, int exponent) {
2039 return convert_half2(ldexp(convert_float2(v), exponent));
2040 }
ldexp(half3 v,int exponent)2041 extern half3 __attribute__((overloadable)) ldexp(half3 v, int exponent) {
2042 return convert_half3(ldexp(convert_float3(v), exponent));
2043 }
ldexp(half4 v,int exponent)2044 extern half4 __attribute__((overloadable)) ldexp(half4 v, int exponent) {
2045 return convert_half4(ldexp(convert_float4(v), exponent));
2046 }
2047
// length: half input of any width, scalar half result (H_FUNC_HN).
// lgamma: one-argument half overloads; the overloads that also report the
// sign through a pointer are written by hand below.
2048 H_FUNC_HN(length);
2049 HN_FUNC_HN(lgamma);
2050
lgamma(half h,int * signp)2051 extern half __attribute__((overloadable)) lgamma(half h, int *signp) {
2052 return (half) lgamma((float) h, signp);
2053 }
lgamma(half2 v,int2 * signp)2054 extern half2 __attribute__((overloadable)) lgamma(half2 v, int2 *signp) {
2055 return convert_half2(lgamma(convert_float2(v), signp));
2056 }
lgamma(half3 v,int3 * signp)2057 extern half3 __attribute__((overloadable)) lgamma(half3 v, int3 *signp) {
2058 return convert_half3(lgamma(convert_float3(v), signp));
2059 }
lgamma(half4 v,int4 * signp)2060 extern half4 __attribute__((overloadable)) lgamma(half4 v, int4 *signp) {
2061 return convert_half4(lgamma(convert_float4(v), signp));
2062 }
2063
// Half overloads for the logarithm family, mad, and max/min, generated by
// macros defined outside this excerpt.
2064 HN_FUNC_HN(log);
2065 HN_FUNC_HN(log10);
2066 HN_FUNC_HN(log1p);
2067 HN_FUNC_HN(log2);
2068 HN_FUNC_HN(logb);
2069
2070 HN_FUNC_HN_HN_HN(mad);
// max/min are instantiated twice: once per operand-shape generator.
// NOTE(review): by their names, _HN_HN pairs same-width operands and _HN_H
// pairs a vector with a scalar -- confirm against the macro definitions.
2071 HN_FUNC_HN_HN(max);
2072 HN_FUNC_HN_H(max); // TODO can this be arch-specific similar to _Z3maxDv2_ff?
2073 HN_FUNC_HN_HN(min);
2074 HN_FUNC_HN_H(min); // TODO can this be arch-specific similar to _Z3minDv2_ff?
2075
mix(half start,half stop,half amount)2076 extern half __attribute__((overloadable)) mix(half start, half stop, half amount) {
2077 return start + (stop - start) * amount;
2078 }
mix(half2 start,half2 stop,half2 amount)2079 extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half2 amount) {
2080 return start + (stop - start) * amount;
2081 }
mix(half3 start,half3 stop,half3 amount)2082 extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half3 amount) {
2083 return start + (stop - start) * amount;
2084 }
mix(half4 start,half4 stop,half4 amount)2085 extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half4 amount) {
2086 return start + (stop - start) * amount;
2087 }
mix(half2 start,half2 stop,half amount)2088 extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half amount) {
2089 return start + (stop - start) * amount;
2090 }
mix(half3 start,half3 stop,half amount)2091 extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half amount) {
2092 return start + (stop - start) * amount;
2093 }
mix(half4 start,half4 stop,half amount)2094 extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half amount) {
2095 return start + (stop - start) * amount;
2096 }
2097
// Scalar half modf: declaration only (no body in this part of the file).
// The half2/half3/half4 overloads are built from it one lane at a time and
// write through the output pointer without a null check.
2098 extern half __attribute__((overloadable)) modf(half x, half *iptr);
2099 SCALARIZE_HN_FUNC_HN_PHN(modf);
2100
nan_half()2101 half __attribute__((overloadable)) nan_half() {
2102 unsigned short nan_short = kHalfPositiveInfinity | 0x0200;
2103 half nan;
2104 SET_HALF_WORD(nan, nan_short);
2105 return nan;
2106 }
2107
// Half overloads of normalize, generated by a macro defined outside this
// excerpt.
2108 HN_FUNC_HN(normalize);
2109
// Scalar half nextafter: declaration only; the vector overloads are built
// from it one lane at a time.
2110 extern half __attribute__((overloadable)) nextafter(half x, half y);
2111 SCALARIZE_HN_FUNC_HN_HN(nextafter);
2112
// Power family, radians and remainder.
// NOTE(review): HN_FUNC_HN_IN presumably pairs a half operand with an int
// operand (pown takes an integer power) -- confirm against the macro.
2113 HN_FUNC_HN_HN(pow);
2114 HN_FUNC_HN_IN(pown);
2115 HN_FUNC_HN_HN(powr);
2116 HN_FUNC_HN(radians);
2117 HN_FUNC_HN_HN(remainder);
2118
remquo(half n,half d,int * quo)2119 extern half __attribute__((overloadable)) remquo(half n, half d, int *quo) {
2120 return (float) remquo((float) n, (float) d, quo);
2121 }
remquo(half2 n,half2 d,int2 * quo)2122 extern half2 __attribute__((overloadable)) remquo(half2 n, half2 d, int2 *quo) {
2123 return convert_half2(remquo(convert_float2(d), convert_float2(n), quo));
2124 }
remquo(half3 n,half3 d,int3 * quo)2125 extern half3 __attribute__((overloadable)) remquo(half3 n, half3 d, int3 *quo) {
2126 return convert_half3(remquo(convert_float3(d), convert_float3(n), quo));
2127 }
remquo(half4 n,half4 d,int4 * quo)2128 extern half4 __attribute__((overloadable)) remquo(half4 n, half4 d, int4 *quo) {
2129 return convert_half4(remquo(convert_float4(d), convert_float4(n), quo));
2130 }
2131
// Half overloads for rounding and root functions, generated by macros defined
// outside this excerpt.
// NOTE(review): rootn goes through HN_FUNC_HN_IN, presumably because its
// second operand is an integer -- confirm against the macro.
2132 HN_FUNC_HN(rint);
2133 HN_FUNC_HN_IN(rootn);
2134 HN_FUNC_HN(round);
2135 HN_FUNC_HN(rsqrt);
2136
sign(half h)2137 extern half __attribute__((overloadable)) sign(half h) {
2138 if (h > 0) return (half) 1.f;
2139 if (h < 0) return (half) -1.f;
2140 return h;
2141 }
sign(half2 v)2142 extern half2 __attribute__((overloadable)) sign(half2 v) {
2143 half2 ret;
2144 ret.x = sign(v.x);
2145 ret.y = sign(v.y);
2146 return ret;
2147 }
sign(half3 v)2148 extern half3 __attribute__((overloadable)) sign(half3 v) {
2149 half3 ret;
2150 ret.x = sign(v.x);
2151 ret.y = sign(v.y);
2152 ret.z = sign(v.z);
2153 return ret;
2154 }
sign(half4 v)2155 extern half4 __attribute__((overloadable)) sign(half4 v) {
2156 half4 ret;
2157 ret.x = sign(v.x);
2158 ret.y = sign(v.y);
2159 ret.z = sign(v.z);
2160 ret.w = sign(v.w);
2161 return ret;
2162 }
2163
// Half overloads of sin (generator macro defined outside this excerpt); the
// sincos overloads below call them together with the cos overloads.
2164 HN_FUNC_HN(sin);
2165
sincos(half v,half * cosptr)2166 extern half __attribute__((overloadable)) sincos(half v, half *cosptr) {
2167 *cosptr = cos(v);
2168 return sin(v);
2169 }
2170 // TODO verify if LLVM eliminates the duplicate convert_float2
sincos(half2 v,half2 * cosptr)2171 extern half2 __attribute__((overloadable)) sincos(half2 v, half2 *cosptr) {
2172 *cosptr = cos(v);
2173 return sin(v);
2174 }
sincos(half3 v,half3 * cosptr)2175 extern half3 __attribute__((overloadable)) sincos(half3 v, half3 *cosptr) {
2176 *cosptr = cos(v);
2177 return sin(v);
2178 }
sincos(half4 v,half4 * cosptr)2179 extern half4 __attribute__((overloadable)) sincos(half4 v, half4 *cosptr) {
2180 *cosptr = cos(v);
2181 return sin(v);
2182 }
2183
// Half overloads of sinh, sinpi and sqrt, generated by a macro defined outside
// this excerpt.
2184 HN_FUNC_HN(sinh);
2185 HN_FUNC_HN(sinpi);
2186 HN_FUNC_HN(sqrt);
2187
step(half edge,half v)2188 extern half __attribute__((overloadable)) step(half edge, half v) {
2189 return (v < edge) ? 0.f : 1.f;
2190 }
step(half2 edge,half2 v)2191 extern half2 __attribute__((overloadable)) step(half2 edge, half2 v) {
2192 half2 r;
2193 r.x = (v.x < edge.x) ? 0.f : 1.f;
2194 r.y = (v.y < edge.y) ? 0.f : 1.f;
2195 return r;
2196 }
step(half3 edge,half3 v)2197 extern half3 __attribute__((overloadable)) step(half3 edge, half3 v) {
2198 half3 r;
2199 r.x = (v.x < edge.x) ? 0.f : 1.f;
2200 r.y = (v.y < edge.y) ? 0.f : 1.f;
2201 r.z = (v.z < edge.z) ? 0.f : 1.f;
2202 return r;
2203 }
step(half4 edge,half4 v)2204 extern half4 __attribute__((overloadable)) step(half4 edge, half4 v) {
2205 half4 r;
2206 r.x = (v.x < edge.x) ? 0.f : 1.f;
2207 r.y = (v.y < edge.y) ? 0.f : 1.f;
2208 r.z = (v.z < edge.z) ? 0.f : 1.f;
2209 r.w = (v.w < edge.w) ? 0.f : 1.f;
2210 return r;
2211 }
step(half2 edge,half v)2212 extern half2 __attribute__((overloadable)) step(half2 edge, half v) {
2213 half2 r;
2214 r.x = (v < edge.x) ? 0.f : 1.f;
2215 r.y = (v < edge.y) ? 0.f : 1.f;
2216 return r;
2217 }
step(half3 edge,half v)2218 extern half3 __attribute__((overloadable)) step(half3 edge, half v) {
2219 half3 r;
2220 r.x = (v < edge.x) ? 0.f : 1.f;
2221 r.y = (v < edge.y) ? 0.f : 1.f;
2222 r.z = (v < edge.z) ? 0.f : 1.f;
2223 return r;
2224 }
step(half4 edge,half v)2225 extern half4 __attribute__((overloadable)) step(half4 edge, half v) {
2226 half4 r;
2227 r.x = (v < edge.x) ? 0.f : 1.f;
2228 r.y = (v < edge.y) ? 0.f : 1.f;
2229 r.z = (v < edge.z) ? 0.f : 1.f;
2230 r.w = (v < edge.w) ? 0.f : 1.f;
2231 return r;
2232 }
step(half edge,half2 v)2233 extern half2 __attribute__((overloadable)) step(half edge, half2 v) {
2234 half2 r;
2235 r.x = (v.x < edge) ? 0.f : 1.f;
2236 r.y = (v.y < edge) ? 0.f : 1.f;
2237 return r;
2238 }
step(half edge,half3 v)2239 extern half3 __attribute__((overloadable)) step(half edge, half3 v) {
2240 half3 r;
2241 r.x = (v.x < edge) ? 0.f : 1.f;
2242 r.y = (v.y < edge) ? 0.f : 1.f;
2243 r.z = (v.z < edge) ? 0.f : 1.f;
2244 return r;
2245 }
step(half edge,half4 v)2246 extern half4 __attribute__((overloadable)) step(half edge, half4 v) {
2247 half4 r;
2248 r.x = (v.x < edge) ? 0.f : 1.f;
2249 r.y = (v.y < edge) ? 0.f : 1.f;
2250 r.z = (v.z < edge) ? 0.f : 1.f;
2251 r.w = (v.w < edge) ? 0.f : 1.f;
2252 return r;
2253 }
2254
// Half overloads of the remaining precise functions (tangent family, gamma,
// trunc), generated by a macro defined outside this excerpt.
2255 HN_FUNC_HN(tan);
2256 HN_FUNC_HN(tanh);
2257 HN_FUNC_HN(tanpi);
2258 HN_FUNC_HN(tgamma);
2259 HN_FUNC_HN(trunc); // TODO: rethink: needs half-specific implementation?
2260
// Half overloads of the native_* family.  They are produced by the same
// generator macros as the precise functions above.
// NOTE(review): presumably each forwards to the float native_* function of
// the same name -- confirm against the macro definitions.
2261 HN_FUNC_HN(native_acos);
2262 HN_FUNC_HN(native_acosh);
2263 HN_FUNC_HN(native_acospi);
2264 HN_FUNC_HN(native_asin);
2265 HN_FUNC_HN(native_asinh);
2266 HN_FUNC_HN(native_asinpi);
2267 HN_FUNC_HN(native_atan);
2268 HN_FUNC_HN(native_atanh);
2269 HN_FUNC_HN(native_atanpi);
2270 HN_FUNC_HN_HN(native_atan2);
2271 HN_FUNC_HN_HN(native_atan2pi);
2272
2273 HN_FUNC_HN(native_cbrt);
2274 HN_FUNC_HN(native_cos);
2275 HN_FUNC_HN(native_cosh);
2276 HN_FUNC_HN(native_cospi);
2277
// native_distance yields a scalar half (H_FUNC_HN_HN widens both operands to
// float).
2278 H_FUNC_HN_HN(native_distance);
2279 HN_FUNC_HN_HN(native_divide);
2280
2281 HN_FUNC_HN(native_exp);
2282 HN_FUNC_HN(native_exp10);
2283 HN_FUNC_HN(native_exp2);
2284 HN_FUNC_HN(native_expm1);
2285
2286 HN_FUNC_HN_HN(native_hypot);
// native_length yields a scalar half for any input width (H_FUNC_HN).
2287 H_FUNC_HN(native_length);
2288
2289 HN_FUNC_HN(native_log);
2290 HN_FUNC_HN(native_log10);
2291 HN_FUNC_HN(native_log1p);
2292 HN_FUNC_HN(native_log2);
2293
2294 HN_FUNC_HN(native_normalize);
2295
2296 HN_FUNC_HN_HN(native_powr); // TODO are parameter limits different for half?
2297
2298 HN_FUNC_HN(native_recip);
2299 HN_FUNC_HN_IN(native_rootn);
2300 HN_FUNC_HN(native_rsqrt);
2301
2302 HN_FUNC_HN(native_sin);
2303
native_sincos(half v,half * cosptr)2304 extern half __attribute__((overloadable)) native_sincos(half v, half *cosptr) {
2305 return sincos(v, cosptr);
2306 }
native_sincos(half2 v,half2 * cosptr)2307 extern half2 __attribute__((overloadable)) native_sincos(half2 v, half2 *cosptr) {
2308 return sincos(v, cosptr);
2309 }
native_sincos(half3 v,half3 * cosptr)2310 extern half3 __attribute__((overloadable)) native_sincos(half3 v, half3 *cosptr) {
2311 return sincos(v, cosptr);
2312 }
native_sincos(half4 v,half4 * cosptr)2313 extern half4 __attribute__((overloadable)) native_sincos(half4 v, half4 *cosptr) {
2314 return sincos(v, cosptr);
2315 }
2316
// Remaining native_* half overloads (hyperbolic sine, sinpi, sqrt and the
// tangent family), generated by a macro defined outside this excerpt.
2317 HN_FUNC_HN(native_sinh);
2318 HN_FUNC_HN(native_sinpi);
2319 HN_FUNC_HN(native_sqrt);
2320
2321 HN_FUNC_HN(native_tan);
2322 HN_FUNC_HN(native_tanh);
2323 HN_FUNC_HN(native_tanpi);
2324
// Retire every f16 generator macro now that all overloads have been emitted,
// so none of them leaks into code that follows.  SCALARIZE_HN_FUNC_HN_PHN was
// previously left defined; it is retired here with the rest.
#undef HN_FUNC_HN
#undef HN_FUNC_HN_HN
#undef HN_FUNC_HN_H
#undef HN_FUNC_HN_HN_HN
#undef HN_FUNC_HN_IN
#undef H_FUNC_HN
#undef H_FUNC_HN_HN
#undef SCALARIZE_HN_FUNC_HN_HN
#undef SCALARIZE_HN_FUNC_HN_PHN
2333
2334 // exports unavailable mathlib functions to compat lib
2335
2336 #ifdef RS_COMPATIBILITY_LIB
2337
2338 // !!! DANGER !!!
2339 // These functions are potentially missing on older Android versions.
2340 // Work around the issue by supplying our own variants.
2341 // !!! DANGER !!!
2342
2343 // The logbl() implementation is taken from the latest bionic/, since
2344 // double == long double on Android.
// Exported with C linkage and only built when RS_COMPATIBILITY_LIB is defined.
// The body simply forwards its argument to logb().
logbl(long double x)2345 extern "C" long double logbl(long double x) { return logb(x); }
2346
2347 // __aeabi_idiv0 is a missing function in libcompiler_rt.so, so we just
2348 // pick the simplest implementation based on the ARM EABI doc.
// Exported with C linkage and only built when RS_COMPATIBILITY_LIB is defined.
// The body returns its argument unchanged.
// NOTE(review): presumably this is the hook the ARM runtime invokes on integer
// division by zero -- confirm against the ARM run-time ABI document.
__aeabi_idiv0(int v)2349 extern "C" int __aeabi_idiv0(int v) { return v; }
2350
2351 #endif // compatibility lib
2352