1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "reference_math.h"
18 #include "harness/compat.h"
19
20 #include <climits>
21
22 #if !defined(_WIN32)
23 #include <cstring>
24 #endif
25
26 #include "utility.h"
27
28 #if defined(__SSE__) \
29 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
30 #include <xmmintrin.h>
31 #endif
32 #if defined(__SSE2__) \
33 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
34 #include <emmintrin.h>
35 #endif
36
37 #ifndef M_PI_4
38 #define M_PI_4 (M_PI / 4)
39 #endif
40
41 #pragma STDC FP_CONTRACT OFF
42 static void __log2_ep(double *hi, double *lo, double x);
43
44 typedef union {
45 uint64_t i;
46 double d;
47 } uint64d_t;
48
49 static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
50
51 #define cl_make_nan() _CL_NAN.d
52
reduce1(double x)53 static double reduce1(double x)
54 {
55 if (fabs(x) >= HEX_DBL(+, 1, 0, +, 53))
56 {
57 if (fabs(x) == INFINITY) return cl_make_nan();
58
59 return 0.0; // we patch up the sign for sinPi and cosPi later, since
60 // they need different signs
61 }
62
63 // Find the nearest multiple of 2
64 const double r = copysign(HEX_DBL(+, 1, 0, +, 53), x);
65 double z = x + r;
66 z -= r;
67
68 // subtract it from x. Value is now in the range -1 <= x <= 1
69 return x - z;
70 }
71
reference_acospi(double x)72 double reference_acospi(double x) { return reference_acos(x) / M_PI; }
reference_asinpi(double x)73 double reference_asinpi(double x) { return reference_asin(x) / M_PI; }
reference_atanpi(double x)74 double reference_atanpi(double x) { return reference_atan(x) / M_PI; }
reference_atan2pi(double y,double x)75 double reference_atan2pi(double y, double x)
76 {
77 return reference_atan2(y, x) / M_PI;
78 }
reference_cospi(double x)79 double reference_cospi(double x)
80 {
81 if (reference_fabs(x) >= HEX_DBL(+, 1, 0, +, 52))
82 {
83 if (reference_fabs(x) == INFINITY) return cl_make_nan();
84
85 // Note this probably fails for odd values between 0x1.0p52 and
86 // 0x1.0p53. However, when starting with single precision inputs, there
87 // will be no odd values.
88
89 return 1.0;
90 }
91
92 x = reduce1(x + 0.5);
93
94 // reduce to [-0.5, 0.5]
95 if (x < -0.5)
96 x = -1 - x;
97 else if (x > 0.5)
98 x = 1 - x;
99
100 // cosPi zeros are all +0
101 if (x == 0.0) return 0.0;
102
103 return reference_sin(x * M_PI);
104 }
105
reference_relaxed_cospi(double x)106 double reference_relaxed_cospi(double x) { return reference_cospi(x); }
107
reference_relaxed_divide(double x,double y)108 double reference_relaxed_divide(double x, double y)
109 {
110 return (float)(((float)x) / ((float)y));
111 }
112
reference_divide(double x,double y)113 double reference_divide(double x, double y) { return x / y; }
114
115 // Add a + b. If the result modulo overflowed, write 1 to *carry, otherwise 0
add_carry(cl_ulong a,cl_ulong b,cl_ulong * carry)116 static inline cl_ulong add_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
117 {
118 cl_ulong result = a + b;
119 *carry = result < a;
120 return result;
121 }
122
123 // Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise
124 // 0
sub_carry(cl_ulong a,cl_ulong b,cl_ulong * carry)125 static inline cl_ulong sub_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
126 {
127 cl_ulong result = a - b;
128 *carry = result > a;
129 return result;
130 }
131
fallback_frexpf(float x,int * iptr)132 static float fallback_frexpf(float x, int *iptr)
133 {
134 cl_uint u, v;
135 float fu, fv;
136
137 memcpy(&u, &x, sizeof(u));
138
139 cl_uint exponent = u & 0x7f800000U;
140 cl_uint mantissa = u & ~0x7f800000U;
141
142 // add 1 to the exponent
143 exponent += 0x00800000U;
144
145 if ((cl_int)exponent < (cl_int)0x01000000)
146 { // subnormal, NaN, Inf
147 mantissa |= 0x3f000000U;
148
149 v = mantissa & 0xff800000U;
150 u = mantissa;
151 memcpy(&fv, &v, sizeof(v));
152 memcpy(&fu, &u, sizeof(u));
153
154 fu -= fv;
155
156 memcpy(&v, &fv, sizeof(v));
157 memcpy(&u, &fu, sizeof(u));
158
159 exponent = u & 0x7f800000U;
160 mantissa = u & ~0x7f800000U;
161
162 *iptr = (exponent >> 23) + (-126 + 1 - 126);
163 u = mantissa | 0x3f000000U;
164 memcpy(&fu, &u, sizeof(u));
165 return fu;
166 }
167
168 *iptr = (exponent >> 23) - 127;
169 u = mantissa | 0x3f000000U;
170 memcpy(&fu, &u, sizeof(u));
171 return fu;
172 }
173
extractf(float x,cl_uint * mant)174 static inline int extractf(float x, cl_uint *mant)
175 {
176 static float (*frexppf)(float, int *) = NULL;
177 int e;
178
179 // verify that frexp works properly
180 if (NULL == frexppf)
181 {
182 if (0.5f == frexpf(HEX_FLT(+, 1, 0, -, 130), &e) && e == -129)
183 frexppf = frexpf;
184 else
185 frexppf = fallback_frexpf;
186 }
187
188 *mant = (cl_uint)(HEX_FLT(+, 1, 0, +, 32) * fabsf(frexppf(x, &e)));
189 return e - 1;
190 }
191
192 // Shift right by shift bits. Any bits lost on the right side are bitwise OR'd
193 // together and ORd into the LSB of the result
shift_right_sticky_64(cl_ulong * p,int shift)194 static inline void shift_right_sticky_64(cl_ulong *p, int shift)
195 {
196 cl_ulong sticky = 0;
197 cl_ulong r = *p;
198
199 // C doesn't handle shifts greater than the size of the variable dependably
200 if (shift >= 64)
201 {
202 sticky |= (0 != r);
203 r = 0;
204 }
205 else
206 {
207 sticky |= (0 != (r << (64 - shift)));
208 r >>= shift;
209 }
210
211 *p = r | sticky;
212 }
213
214 // Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd
215 // into the LSB of the result
add64(cl_ulong * p,cl_ulong c,int * exponent)216 static inline void add64(cl_ulong *p, cl_ulong c, int *exponent)
217 {
218 cl_ulong carry;
219 c = add_carry(c, *p, &carry);
220 if (carry)
221 {
222 carry = c & 1; // set aside sticky bit
223 c >>= 1; // right shift to deal with overflow
224 c |= carry
225 | 0x8000000000000000ULL; // or in carry bit, and sticky bit. The
226 // latter is to prevent rounding from
227 // believing we are exact half way case
228 *exponent = *exponent + 1; // adjust exponent
229 }
230
231 *p = c;
232 }
233
234 // IEEE-754 round to nearest, ties to even rounding
round_to_nearest_even_float(cl_ulong p,int exponent)235 static float round_to_nearest_even_float(cl_ulong p, int exponent)
236 {
237 union {
238 cl_uint u;
239 cl_float d;
240 } u;
241
242 // If mantissa is zero, return 0.0f
243 if (p == 0) return 0.0f;
244
245 // edges
246 if (exponent > 127)
247 {
248 volatile float r = exponent * CL_FLT_MAX; // signal overflow
249
250 // attempt to fool the compiler into not optimizing the above line away
251 if (r > CL_FLT_MAX) return INFINITY;
252
253 return r;
254 }
255 if (exponent == -150 && p > 0x8000000000000000ULL)
256 return HEX_FLT(+, 1, 0, -, 149);
257 if (exponent <= -150) return 0.0f;
258
259 // Figure out which bits go where
260 int shift = 8 + 32;
261 if (exponent < -126)
262 {
263 shift -= 126 + exponent; // subnormal: shift is not 52
264 exponent = -127; // set exponent to 0
265 }
266 else
267 p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
268 // it.
269
270 // Assemble the double (round toward zero)
271 u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
272
273 // put a representation of the residual bits into hi
274 p <<= (64 - shift);
275
276 // round to nearest, ties to even based on the unused portion of p
277 if (p < 0x8000000000000000ULL) return u.d;
278 if (p == 0x8000000000000000ULL)
279 u.u += u.u & 1U;
280 else
281 u.u++;
282
283 return u.d;
284 }
285
round_to_nearest_even_float_ftz(cl_ulong p,int exponent)286 static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent)
287 {
288 extern int gCheckTininessBeforeRounding;
289
290 union {
291 cl_uint u;
292 cl_float d;
293 } u;
294 int shift = 8 + 32;
295
296 // If mantissa is zero, return 0.0f
297 if (p == 0) return 0.0f;
298
299 // edges
300 if (exponent > 127)
301 {
302 volatile float r = exponent * CL_FLT_MAX; // signal overflow
303
304 // attempt to fool the compiler into not optimizing the above line away
305 if (r > CL_FLT_MAX) return INFINITY;
306
307 return r;
308 }
309
310 // Deal with FTZ for gCheckTininessBeforeRounding
311 if (exponent < (gCheckTininessBeforeRounding - 127)) return 0.0f;
312
313 if (exponent
314 == -127) // only happens for machines that check tininess after rounding
315 p = (p & 1) | (p >> 1);
316 else
317 p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
318 // it.
319
320 cl_ulong q = p;
321
322
323 // Assemble the double (round toward zero)
324 u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
325
326 // put a representation of the residual bits into hi
327 q <<= (64 - shift);
328
329 // round to nearest, ties to even based on the unused portion of p
330 if (q > 0x8000000000000000ULL)
331 u.u++;
332 else if (q == 0x8000000000000000ULL)
333 u.u += u.u & 1U;
334
335 // Deal with FTZ for ! gCheckTininessBeforeRounding
336 if (0 == (u.u & 0x7f800000U)) return 0.0f;
337
338 return u.d;
339 }
340
341
342 // IEEE-754 round toward zero.
round_toward_zero_float(cl_ulong p,int exponent)343 static float round_toward_zero_float(cl_ulong p, int exponent)
344 {
345 union {
346 cl_uint u;
347 cl_float d;
348 } u;
349
350 // If mantissa is zero, return 0.0f
351 if (p == 0) return 0.0f;
352
353 // edges
354 if (exponent > 127)
355 {
356 volatile float r = exponent * CL_FLT_MAX; // signal overflow
357
358 // attempt to fool the compiler into not optimizing the above line away
359 if (r > CL_FLT_MAX) return CL_FLT_MAX;
360
361 return r;
362 }
363
364 if (exponent <= -149) return 0.0f;
365
366 // Figure out which bits go where
367 int shift = 8 + 32;
368 if (exponent < -126)
369 {
370 shift -= 126 + exponent; // subnormal: shift is not 52
371 exponent = -127; // set exponent to 0
372 }
373 else
374 p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
375 // it.
376
377 // Assemble the double (round toward zero)
378 u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
379
380 return u.d;
381 }
382
round_toward_zero_float_ftz(cl_ulong p,int exponent)383 static float round_toward_zero_float_ftz(cl_ulong p, int exponent)
384 {
385 union {
386 cl_uint u;
387 cl_float d;
388 } u;
389 int shift = 8 + 32;
390
391 // If mantissa is zero, return 0.0f
392 if (p == 0) return 0.0f;
393
394 // edges
395 if (exponent > 127)
396 {
397 volatile float r = exponent * CL_FLT_MAX; // signal overflow
398
399 // attempt to fool the compiler into not optimizing the above line away
400 if (r > CL_FLT_MAX) return CL_FLT_MAX;
401
402 return r;
403 }
404
405 // Deal with FTZ for gCheckTininessBeforeRounding
406 if (exponent < -126) return 0.0f;
407
408 cl_ulong q = p &=
409 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
410
411 // Assemble the double (round toward zero)
412 u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
413
414 // put a representation of the residual bits into hi
415 q <<= (64 - shift);
416
417 return u.d;
418 }
419
420 // Subtract two significands.
sub64(cl_ulong * c,cl_ulong p,cl_uint * signC,int * expC)421 static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC)
422 {
423 cl_ulong carry;
424 p = sub_carry(*c, p, &carry);
425
426 if (carry)
427 {
428 *signC ^= 0x80000000U;
429 p = -p;
430 }
431
432 // normalize
433 if (p)
434 {
435 int shift = 32;
436 cl_ulong test = 1ULL << 32;
437 while (0 == (p & 0x8000000000000000ULL))
438 {
439 if (p < test)
440 {
441 p <<= shift;
442 *expC = *expC - shift;
443 }
444 shift >>= 1;
445 test <<= shift;
446 }
447 }
448 else
449 {
450 // zero result.
451 *expC = -200;
452 *signC =
453 0; // IEEE rules say a - a = +0 for all rounding modes except -inf
454 }
455
456 *c = p;
457 }
458
459
reference_fma(float a,float b,float c,int shouldFlush)460 float reference_fma(float a, float b, float c, int shouldFlush)
461 {
462 static const cl_uint kMSB = 0x80000000U;
463
464 // Make bits accessible
465 union {
466 cl_uint u;
467 cl_float d;
468 } ua;
469 ua.d = a;
470 union {
471 cl_uint u;
472 cl_float d;
473 } ub;
474 ub.d = b;
475 union {
476 cl_uint u;
477 cl_float d;
478 } uc;
479 uc.d = c;
480
481 // deal with Nans, infinities and zeros
482 if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
483 || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
484 0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
485 0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
486 {
487 FPU_mode_type oldMode;
488 RoundingMode oldRoundMode = kRoundToNearestEven;
489 if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
490
491 if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
492
493 memset(&oldMode, 0, sizeof(oldMode));
494 if (shouldFlush) ForceFTZ(&oldMode);
495
496 a = (float)reference_multiply(
497 a, b); // some risk that the compiler will insert a non-compliant
498 // fma here on some platforms.
499 a = (float)reference_add(
500 a,
501 c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
502
503 if (shouldFlush) RestoreFPState(&oldMode);
504
505 if (gIsInRTZMode) set_round(oldRoundMode, kfloat);
506 return a;
507 }
508
509 // extract exponent and mantissa
510 // exponent is a standard unbiased signed integer
511 // mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
512 cl_uint mantA, mantB, mantC;
513 int expA = extractf(a, &mantA);
514 int expB = extractf(b, &mantB);
515 int expC = extractf(c, &mantC);
516 cl_uint signC = uc.u & kMSB; // We'll need the sign bit of C later to decide
517 // if we are adding or subtracting
518
519 // exact product of A and B
520 int exponent = expA + expB;
521 cl_uint sign = (ua.u ^ ub.u) & kMSB;
522 cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
523
524 // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
525 // The MSB might not be set. If so, fix that. Otherwise, reflect the fact
526 // that we got another power of two from the multiplication
527 if (0 == (0x8000000000000000ULL & product))
528 product <<= 1;
529 else
530 exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our
531 // exponent increased.
532
533 // infinite precision add
534 cl_ulong addend = (cl_ulong)mantC << 32;
535 if (exponent >= expC)
536 {
537 // Shift C relative to the product so that their exponents match
538 if (exponent > expC) shift_right_sticky_64(&addend, exponent - expC);
539
540 // Add
541 if (sign ^ signC)
542 sub64(&product, addend, &sign, &exponent);
543 else
544 add64(&product, addend, &exponent);
545 }
546 else
547 {
548 // Shift the product relative to C so that their exponents match
549 shift_right_sticky_64(&product, expC - exponent);
550
551 // add
552 if (sign ^ signC)
553 sub64(&addend, product, &signC, &expC);
554 else
555 add64(&addend, product, &expC);
556
557 product = addend;
558 exponent = expC;
559 sign = signC;
560 }
561
562 // round to IEEE result -- we do not do flushing to zero here. That part is
563 // handled manually in ternary.c.
564 if (gIsInRTZMode)
565 {
566 if (shouldFlush)
567 ua.d = round_toward_zero_float_ftz(product, exponent);
568 else
569 ua.d = round_toward_zero_float(product, exponent);
570 }
571 else
572 {
573 if (shouldFlush)
574 ua.d = round_to_nearest_even_float_ftz(product, exponent);
575 else
576 ua.d = round_to_nearest_even_float(product, exponent);
577 }
578
579 // Set the sign
580 ua.u |= sign;
581
582 return ua.d;
583 }
584
reference_relaxed_exp10(double x)585 double reference_relaxed_exp10(double x) { return reference_exp10(x); }
586
reference_exp10(double x)587 double reference_exp10(double x)
588 {
589 return reference_exp2(x * HEX_DBL(+, 1, a934f0979a371, +, 1));
590 }
591
592
reference_ilogb(double x)593 int reference_ilogb(double x)
594 {
595 extern int gDeviceILogb0, gDeviceILogbNaN;
596 union {
597 cl_double f;
598 cl_ulong u;
599 } u;
600
601 u.f = (float)x;
602 cl_int exponent = (cl_int)(u.u >> 52) & 0x7ff;
603 if (exponent == 0x7ff)
604 {
605 if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
606
607 return CL_INT_MAX;
608 }
609
610 if (exponent == 0)
611 { // deal with denormals
612 u.f = x * HEX_DBL(+, 1, 0, +, 64);
613 exponent = (cl_int)(u.u >> 52) & 0x7ff;
614 if (exponent == 0) return gDeviceILogb0;
615
616 return exponent - (1023 + 64);
617 }
618
619 return exponent - 1023;
620 }
621
reference_nan(cl_uint x)622 double reference_nan(cl_uint x)
623 {
624 union {
625 cl_uint u;
626 cl_float f;
627 } u;
628 u.u = x | 0x7fc00000U;
629 return (double)u.f;
630 }
631
reference_maxmag(double x,double y)632 double reference_maxmag(double x, double y)
633 {
634 double fabsx = fabs(x);
635 double fabsy = fabs(y);
636
637 if (fabsx < fabsy) return y;
638
639 if (fabsy < fabsx) return x;
640
641 return reference_fmax(x, y);
642 }
643
reference_minmag(double x,double y)644 double reference_minmag(double x, double y)
645 {
646 double fabsx = fabs(x);
647 double fabsy = fabs(y);
648
649 if (fabsx > fabsy) return y;
650
651 if (fabsy > fabsx) return x;
652
653 return reference_fmin(x, y);
654 }
655
reference_relaxed_mad(double a,double b,double c)656 double reference_relaxed_mad(double a, double b, double c)
657 {
658 return ((float)a) * ((float)b) + (float)c;
659 }
660
reference_mad(double a,double b,double c)661 double reference_mad(double a, double b, double c) { return a * b + c; }
662
reference_recip(double x)663 double reference_recip(double x) { return 1.0 / x; }
reference_rootn(double x,int i)664 double reference_rootn(double x, int i)
665 {
666
667 // rootn ( x, 0 ) returns a NaN.
668 if (0 == i) return cl_make_nan();
669
670 // rootn ( x, n ) returns a NaN for x < 0 and n is even.
671 if (x < 0 && 0 == (i & 1)) return cl_make_nan();
672
673 if (x == 0.0)
674 {
675 switch (i & 0x80000001)
676 {
677 // rootn ( +-0, n ) is +0 for even n > 0.
678 case 0: return 0.0f;
679
680 // rootn ( +-0, n ) is +-0 for odd n > 0.
681 case 1: return x;
682
683 // rootn ( +-0, n ) is +inf for even n < 0.
684 case 0x80000000: return INFINITY;
685
686 // rootn ( +-0, n ) is +-inf for odd n < 0.
687 case 0x80000001: return copysign(INFINITY, x);
688 }
689 }
690
691 double sign = x;
692 x = reference_fabs(x);
693 x = reference_exp2(reference_log2(x) / (double)i);
694 return reference_copysignd(x, sign);
695 }
696
reference_rsqrt(double x)697 double reference_rsqrt(double x) { return 1.0 / reference_sqrt(x); }
698
reference_sinpi(double x)699 double reference_sinpi(double x)
700 {
701 double r = reduce1(x);
702
703 // reduce to [-0.5, 0.5]
704 if (r < -0.5)
705 r = -1 - r;
706 else if (r > 0.5)
707 r = 1 - r;
708
709 // sinPi zeros have the same sign as x
710 if (r == 0.0) return reference_copysignd(0.0, x);
711
712 return reference_sin(r * M_PI);
713 }
714
reference_relaxed_sinpi(double x)715 double reference_relaxed_sinpi(double x) { return reference_sinpi(x); }
716
reference_tanpi(double x)717 double reference_tanpi(double x)
718 {
719 // set aside the sign (allows us to preserve sign of -0)
720 double sign = reference_copysignd(1.0, x);
721 double z = reference_fabs(x);
722
723 // if big and even -- caution: only works if x only has single precision
724 if (z >= HEX_DBL(+, 1, 0, +, 24))
725 {
726 if (z == INFINITY) return x - x; // nan
727
728 return reference_copysignd(
729 0.0, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n.
730 }
731
732 // reduce to the range [ -0.5, 0.5 ]
733 double nearest = reference_rint(z); // round to nearest even places n + 0.5
734 // values in the right place for us
735 int i = (int)nearest; // test above against 0x1.0p24 avoids overflow here
736 z -= nearest;
737
738 // correction for odd integer x for the right sign of zero
739 if ((i & 1) && z == 0.0) sign = -sign;
740
741 // track changes to the sign
742 sign *= reference_copysignd(1.0, z); // really should just be an xor
743 z = reference_fabs(z); // remove the sign again
744
745 // reduce once more
746 // If we don't do this, rounding error in z * M_PI will cause us not to
747 // return infinities properly
748 if (z > 0.25)
749 {
750 z = 0.5 - z;
751 return sign
752 / reference_tan(z * M_PI); // use system tan to get the right result
753 }
754
755 //
756 return sign
757 * reference_tan(z * M_PI); // use system tan to get the right result
758 }
759
reference_pown(double x,int i)760 double reference_pown(double x, int i) { return reference_pow(x, (double)i); }
reference_powr(double x,double y)761 double reference_powr(double x, double y)
762 {
763 // powr ( x, y ) returns NaN for x < 0.
764 if (x < 0.0) return cl_make_nan();
765
766 // powr ( x, NaN ) returns the NaN for x >= 0.
767 // powr ( NaN, y ) returns the NaN.
768 if (isnan(x) || isnan(y))
769 return x + y; // Note: behavior different here than for pow(1,NaN),
770 // pow(NaN, 0)
771
772 if (x == 1.0)
773 {
774 // powr ( +1, +-inf ) returns NaN.
775 if (reference_fabs(y) == INFINITY) return cl_make_nan();
776
777 // powr ( +1, y ) is 1 for finite y. (NaN handled above)
778 return 1.0;
779 }
780
781 if (y == 0.0)
782 {
783 // powr ( +inf, +-0 ) returns NaN.
784 // powr ( +-0, +-0 ) returns NaN.
785 if (x == 0.0 || x == INFINITY) return cl_make_nan();
786
787 // powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already
788 // handled above)
789 return 1.0;
790 }
791
792 if (x == 0.0)
793 {
794 // powr ( +-0, -inf) is +inf.
795 // powr ( +-0, y ) is +inf for finite y < 0.
796 if (y < 0.0) return INFINITY;
797
798 // powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above)
799 return 0.0;
800 }
801
802 // x = +inf
803 if (isinf(x))
804 {
805 if (y < 0) return 0;
806 return INFINITY;
807 }
808
809 double fabsx = reference_fabs(x);
810 double fabsy = reference_fabs(y);
811
812 // y = +-inf cases
813 if (isinf(fabsy))
814 {
815 if (y < 0)
816 {
817 if (fabsx < 1) return INFINITY;
818 return 0;
819 }
820 if (fabsx < 1) return 0;
821 return INFINITY;
822 }
823
824 double hi, lo;
825 __log2_ep(&hi, &lo, x);
826 double prod = y * hi;
827 double result = reference_exp2(prod);
828
829 return result;
830 }
831
reference_fract(double x,double * ip)832 double reference_fract(double x, double *ip)
833 {
834 if (isnan(x))
835 {
836 *ip = cl_make_nan();
837 return cl_make_nan();
838 }
839
840 float i;
841 float f = modff((float)x, &i);
842 if (f < 0.0)
843 {
844 f = 1.0f + f;
845 i -= 1.0f;
846 if (f == 1.0f) f = HEX_FLT(+, 1, fffffe, -, 1);
847 }
848 *ip = i;
849 return f;
850 }
851
852
reference_add(double x,double y)853 double reference_add(double x, double y)
854 {
855 volatile float a = (float)x;
856 volatile float b = (float)y;
857
858 #if defined(__SSE__) \
859 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
860 // defeat x87
861 __m128 va = _mm_set_ss((float)a);
862 __m128 vb = _mm_set_ss((float)b);
863 va = _mm_add_ss(va, vb);
864 _mm_store_ss((float *)&a, va);
865 #elif defined(__PPC__)
866 // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
867 // denorm's to zero. As such, the reference add with FTZ must be emulated in
868 // sw.
869 if (fpu_control & _FPU_MASK_NI)
870 {
871 union {
872 cl_uint u;
873 cl_float d;
874 } ua;
875 ua.d = a;
876 union {
877 cl_uint u;
878 cl_float d;
879 } ub;
880 ub.d = b;
881 cl_uint mantA, mantB;
882 cl_ulong addendA, addendB, sum;
883 int expA = extractf(a, &mantA);
884 int expB = extractf(b, &mantB);
885 cl_uint signA = ua.u & 0x80000000U;
886 cl_uint signB = ub.u & 0x80000000U;
887
888 // Force matching exponents if an operand is 0
889 if (a == 0.0f)
890 {
891 expA = expB;
892 }
893 else if (b == 0.0f)
894 {
895 expB = expA;
896 }
897
898 addendA = (cl_ulong)mantA << 32;
899 addendB = (cl_ulong)mantB << 32;
900
901 if (expA >= expB)
902 {
903 // Shift B relative to the A so that their exponents match
904 if (expA > expB) shift_right_sticky_64(&addendB, expA - expB);
905
906 // add
907 if (signA ^ signB)
908 sub64(&addendA, addendB, &signA, &expA);
909 else
910 add64(&addendA, addendB, &expA);
911 }
912 else
913 {
914 // Shift the A relative to B so that their exponents match
915 shift_right_sticky_64(&addendA, expB - expA);
916
917 // add
918 if (signA ^ signB)
919 sub64(&addendB, addendA, &signB, &expB);
920 else
921 add64(&addendB, addendA, &expB);
922
923 addendA = addendB;
924 expA = expB;
925 signA = signB;
926 }
927
928 // round to IEEE result
929 if (gIsInRTZMode)
930 {
931 ua.d = round_toward_zero_float_ftz(addendA, expA);
932 }
933 else
934 {
935 ua.d = round_to_nearest_even_float_ftz(addendA, expA);
936 }
937 // Set the sign
938 ua.u |= signA;
939 a = ua.d;
940 }
941 else
942 {
943 a += b;
944 }
945 #else
946 a += b;
947 #endif
948 return (double)a;
949 }
950
951
reference_subtract(double x,double y)952 double reference_subtract(double x, double y)
953 {
954 volatile float a = (float)x;
955 volatile float b = (float)y;
956 #if defined(__SSE__) \
957 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
958 // defeat x87
959 __m128 va = _mm_set_ss((float)a);
960 __m128 vb = _mm_set_ss((float)b);
961 va = _mm_sub_ss(va, vb);
962 _mm_store_ss((float *)&a, va);
963 #else
964 a -= b;
965 #endif
966 return a;
967 }
968
reference_multiply(double x,double y)969 double reference_multiply(double x, double y)
970 {
971 volatile float a = (float)x;
972 volatile float b = (float)y;
973 #if defined(__SSE__) \
974 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
975 // defeat x87
976 __m128 va = _mm_set_ss((float)a);
977 __m128 vb = _mm_set_ss((float)b);
978 va = _mm_mul_ss(va, vb);
979 _mm_store_ss((float *)&a, va);
980 #elif defined(__PPC__)
981 // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
982 // denorm's to zero. As such, the reference multiply with FTZ must be
983 // emulated in sw.
984 if (fpu_control & _FPU_MASK_NI)
985 {
986 // extract exponent and mantissa
987 // exponent is a standard unbiased signed integer
988 // mantissa is a cl_uint, with leading non-zero bit positioned at the
989 // MSB
990 union {
991 cl_uint u;
992 cl_float d;
993 } ua;
994 ua.d = a;
995 union {
996 cl_uint u;
997 cl_float d;
998 } ub;
999 ub.d = b;
1000 cl_uint mantA, mantB;
1001 int expA = extractf(a, &mantA);
1002 int expB = extractf(b, &mantB);
1003
1004 // exact product of A and B
1005 int exponent = expA + expB;
1006 cl_uint sign = (ua.u ^ ub.u) & 0x80000000U;
1007 cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
1008
1009 // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
1010 // The MSB might not be set. If so, fix that. Otherwise, reflect the
1011 // fact that we got another power of two from the multiplication
1012 if (0 == (0x8000000000000000ULL & product))
1013 product <<= 1;
1014 else
1015 exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then
1016 // our exponent increased.
1017
1018 // round to IEEE result -- we do not do flushing to zero here. That part
1019 // is handled manually in ternary.c.
1020 if (gIsInRTZMode)
1021 {
1022 ua.d = round_toward_zero_float_ftz(product, exponent);
1023 }
1024 else
1025 {
1026 ua.d = round_to_nearest_even_float_ftz(product, exponent);
1027 }
1028 // Set the sign
1029 ua.u |= sign;
1030 a = ua.d;
1031 }
1032 else
1033 {
1034 a *= b;
1035 }
1036 #else
1037 a *= b;
1038 #endif
1039 return a;
1040 }
1041
reference_lgamma_r(double x,int * signp)1042 double reference_lgamma_r(double x, int *signp)
1043 {
1044 // This is not currently tested
1045 *signp = 0;
1046 return x;
1047 }
1048
1049
reference_isequal(double x,double y)1050 int reference_isequal(double x, double y) { return x == y; }
reference_isfinite(double x)1051 int reference_isfinite(double x) { return 0 != isfinite(x); }
reference_isgreater(double x,double y)1052 int reference_isgreater(double x, double y) { return x > y; }
reference_isgreaterequal(double x,double y)1053 int reference_isgreaterequal(double x, double y) { return x >= y; }
reference_isinf(double x)1054 int reference_isinf(double x) { return 0 != isinf(x); }
reference_isless(double x,double y)1055 int reference_isless(double x, double y) { return x < y; }
reference_islessequal(double x,double y)1056 int reference_islessequal(double x, double y) { return x <= y; }
reference_islessgreater(double x,double y)1057 int reference_islessgreater(double x, double y)
1058 {
1059 return 0 != islessgreater(x, y);
1060 }
reference_isnan(double x)1061 int reference_isnan(double x) { return 0 != isnan(x); }
reference_isnormal(double x)1062 int reference_isnormal(double x) { return 0 != isnormal((float)x); }
reference_isnotequal(double x,double y)1063 int reference_isnotequal(double x, double y) { return x != y; }
reference_isordered(double x,double y)1064 int reference_isordered(double x, double y) { return x == x && y == y; }
reference_isunordered(double x,double y)1065 int reference_isunordered(double x, double y) { return isnan(x) || isnan(y); }
reference_signbit(float x)1066 int reference_signbit(float x) { return 0 != signbit(x); }
1067
1068 #if 1 // defined( _MSC_VER )
1069
1070 // Missing functions for win32
1071
1072
reference_copysign(float x,float y)1073 float reference_copysign(float x, float y)
1074 {
1075 union {
1076 float f;
1077 cl_uint u;
1078 } ux, uy;
1079 ux.f = x;
1080 uy.f = y;
1081 ux.u &= 0x7fffffffU;
1082 ux.u |= uy.u & 0x80000000U;
1083 return ux.f;
1084 }
1085
1086
reference_copysignd(double x,double y)1087 double reference_copysignd(double x, double y)
1088 {
1089 union {
1090 double f;
1091 cl_ulong u;
1092 } ux, uy;
1093 ux.f = x;
1094 uy.f = y;
1095 ux.u &= 0x7fffffffffffffffULL;
1096 ux.u |= uy.u & 0x8000000000000000ULL;
1097 return ux.f;
1098 }
1099
1100
reference_round(double x)1101 double reference_round(double x)
1102 {
1103 double absx = reference_fabs(x);
1104 if (absx < 0.5) return reference_copysignd(0.0, x);
1105
1106 if (absx < HEX_DBL(+, 1, 0, +, 53))
1107 x = reference_trunc(x + reference_copysignd(0.5, x));
1108
1109 return x;
1110 }
1111
reference_trunc(double x)1112 double reference_trunc(double x)
1113 {
1114 if (fabs(x) < HEX_DBL(+, 1, 0, +, 53))
1115 {
1116 cl_long l = (cl_long)x;
1117
1118 return reference_copysignd((double)l, x);
1119 }
1120
1121 return x;
1122 }
1123
1124 #ifndef FP_ILOGB0
1125 #define FP_ILOGB0 INT_MIN
1126 #endif
1127
1128 #ifndef FP_ILOGBNAN
1129 #define FP_ILOGBNAN INT_MAX
1130 #endif
1131
1132
reference_cbrt(double x)1133 double reference_cbrt(double x)
1134 {
1135 return reference_copysignd(reference_pow(reference_fabs(x), 1.0 / 3.0), x);
1136 }
1137
reference_rint(double x)1138 double reference_rint(double x)
1139 {
1140 if (reference_fabs(x) < HEX_DBL(+, 1, 0, +, 52))
1141 {
1142 double magic = reference_copysignd(HEX_DBL(+, 1, 0, +, 52), x);
1143 double rounded = (x + magic) - magic;
1144 x = reference_copysignd(rounded, x);
1145 }
1146
1147 return x;
1148 }
1149
reference_acosh(double x)1150 double reference_acosh(double x)
1151 { // not full precision. Sufficient precision to cover float
1152 if (isnan(x)) return x + x;
1153
1154 if (x < 1.0) return cl_make_nan();
1155
1156 return reference_log(x + reference_sqrt(x + 1) * reference_sqrt(x - 1));
1157 }
1158
reference_asinh(double x)1159 double reference_asinh(double x)
1160 {
1161 /*
1162 * ====================================================
1163 * This function is from fdlibm: http://www.netlib.org
1164 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1165 *
1166 * Developed at SunSoft, a Sun Microsystems, Inc. business.
1167 * Permission to use, copy, modify, and distribute this
1168 * software is freely granted, provided that this notice
1169 * is preserved.
1170 * ====================================================
1171 */
1172 if (isnan(x) || isinf(x)) return x + x;
1173
1174 double absx = reference_fabs(x);
1175 if (absx < HEX_DBL(+, 1, 0, -, 28)) return x;
1176
1177 double sign = reference_copysignd(1.0, x);
1178
1179 if (absx > HEX_DBL(+, 1, 0, +, 28))
1180 return sign
1181 * (reference_log(absx)
1182 + 0.693147180559945309417232121458176568); // log(2)
1183
1184 if (absx > 2.0)
1185 return sign
1186 * reference_log(2.0 * absx
1187 + 1.0 / (reference_sqrt(x * x + 1.0) + absx));
1188
1189 return sign
1190 * reference_log1p(absx + x * x / (1.0 + reference_sqrt(1.0 + x * x)));
1191 }
1192
1193
reference_atanh(double x)1194 double reference_atanh(double x)
1195 {
1196 /*
1197 * ====================================================
1198 * This function is from fdlibm: http://www.netlib.org
1199 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1200 *
1201 * Developed at SunSoft, a Sun Microsystems, Inc. business.
1202 * Permission to use, copy, modify, and distribute this
1203 * software is freely granted, provided that this notice
1204 * is preserved.
1205 * ====================================================
1206 */
1207 if (isnan(x)) return x + x;
1208
1209 double signed_half = reference_copysignd(0.5, x);
1210 x = reference_fabs(x);
1211 if (x > 1.0) return cl_make_nan();
1212
1213 if (x < 0.5)
1214 return signed_half * reference_log1p(2.0 * (x + x * x / (1 - x)));
1215
1216 return signed_half * reference_log1p(2.0 * x / (1 - x));
1217 }
1218
reference_relaxed_atan(double x)1219 double reference_relaxed_atan(double x) { return reference_atan(x); }
1220
reference_relaxed_exp2(double x)1221 double reference_relaxed_exp2(double x) { return reference_exp2(x); }
1222
reference_exp2(double x)1223 double reference_exp2(double x)
1224 { // Note: only suitable for verifying single precision. Doesn't have range of a
1225 // full double exp2 implementation.
1226 if (x == 0.0) return 1.0;
1227
1228 // separate x into fractional and integer parts
1229 double i = reference_rint(x); // round to nearest integer
1230
1231 if (i < -150) return 0.0;
1232
1233 if (i > 129) return INFINITY;
1234
1235 double f = x - i; // -0.5 <= f <= 0.5
1236
1237 // find exp2(f)
1238 // calculate as p(f) = (exp2(f)-1)/f
1239 // exp2(f) = f * p(f) + 1
1240 // p(f) is a minimax polynomial with error within 0x1.c1fd80f0d1ab7p-50
1241
1242 double p = 0.693147180560184539289
1243 + (0.240226506955902863183
1244 + (0.055504108656833424373
1245 + (0.009618129212846484796
1246 + (0.001333355902958566035
1247 + (0.000154034191902497930
1248 + (0.000015252317761038105
1249 + (0.000001326283129417092
1250 + 0.000000102593187638680 * f)
1251 * f)
1252 * f)
1253 * f)
1254 * f)
1255 * f)
1256 * f)
1257 * f;
1258 f *= p;
1259 f += 1.0;
1260
1261 // scale by 2 ** i
1262 union {
1263 cl_ulong u;
1264 double d;
1265 } u;
1266 int exponent = (int)i + 1023;
1267 u.u = (cl_ulong)exponent << 52;
1268
1269 return f * u.d;
1270 }
1271
1272
reference_expm1(double x)1273 double reference_expm1(double x)
1274 { // Note: only suitable for verifying single precision. Doesn't have range of a
1275 // full double expm1 implementation. It is only accurate to 47 bits or less.
1276
1277 // early out for small numbers and NaNs
1278 if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 24))) return x;
1279
1280 // early out for large negative numbers
1281 if (x < -130.0) return -1.0;
1282
1283 // early out for large positive numbers
1284 if (x > 100.0) return INFINITY;
1285
1286 // separate x into fractional and integer parts
1287 double i = reference_rint(x); // round to nearest integer
1288 double f = x - i; // -0.5 <= f <= 0.5
1289
1290 // reduce f to the range -0.0625 .. f.. 0.0625
1291 int index = (int)(f * 16.0) + 8; // 0...16
1292
1293 static const double reduction[17] = { -0.5, -0.4375, -0.375, -0.3125,
1294 -0.25, -0.1875, -0.125, -0.0625,
1295 0.0, +0.0625, +0.125, +0.1875,
1296 +0.25, +0.3125, +0.375, +0.4375,
1297 +0.5 };
1298
1299
1300 // exponentials[i] = expm1(reduction[i])
1301 static const double exponentials[17] = {
1302 HEX_DBL(-, 1, 92e9a0720d3ec, -, 2),
1303 HEX_DBL(-, 1, 6adb1cd9205ee, -, 2),
1304 HEX_DBL(-, 1, 40373d42ce2e3, -, 2),
1305 HEX_DBL(-, 1, 12d35a41ba104, -, 2),
1306 HEX_DBL(-, 1, c5041854df7d4, -, 3),
1307 HEX_DBL(-, 1, 5e25fb4fde211, -, 3),
1308 HEX_DBL(-, 1, e14aed893eef4, -, 4),
1309 HEX_DBL(-, 1, f0540438fd5c3, -, 5),
1310 HEX_DBL(+, 0, 0, +, 0),
1311 HEX_DBL(+, 1, 082b577d34ed8, -, 4),
1312 HEX_DBL(+, 1, 10b022db7ae68, -, 3),
1313 HEX_DBL(+, 1, a65c0b85ac1a9, -, 3),
1314 HEX_DBL(+, 1, 22d78f0fa061a, -, 2),
1315 HEX_DBL(+, 1, 77a45d8117fd5, -, 2),
1316 HEX_DBL(+, 1, d1e944f6fbdaa, -, 2),
1317 HEX_DBL(+, 1, 190048ef6002, -, 1),
1318 HEX_DBL(+, 1, 4c2531c3c0d38, -, 1),
1319 };
1320
1321
1322 f -= reduction[index];
1323
1324 // find expm1(f)
1325 // calculate as p(f) = (exp(f)-1)/f
1326 // expm1(f) = f * p(f)
1327 // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over
1328 // the range +- 0.0625
1329 double p = 0.999999999999998001599
1330 + (0.499999999999839628284
1331 + (0.166666666672817459505
1332 + (0.041666666612283048687
1333 + (0.008333330214567431435
1334 + (0.001389005319303770070 + 0.000198833381525156667 * f)
1335 * f)
1336 * f)
1337 * f)
1338 * f)
1339 * f;
1340 f *= p; // expm1( reduced f )
1341
1342 // expm1(f) = (exmp1( reduced_f) + 1.0) * ( exponentials[index] + 1 ) - 1
1343 // = exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) +
1344 // exponentials[index] + 1 -1 = exmp1( reduced_f) *
1345 // exponentials[index] + exmp1( reduced_f) + exponentials[index]
1346 f += exponentials[index] + f * exponentials[index];
1347
1348 // scale by e ** i
1349 int exponent = (int)i;
1350 if (0 == exponent) return f; // precise answer for x near 1
1351
1352 // table of e**(i-150)
1353 static const double exp_table[128 + 150 + 1] = {
1354 HEX_DBL(+, 1, 82e16284f5ec5, -, 217),
1355 HEX_DBL(+, 1, 06e9996332ba1, -, 215),
1356 HEX_DBL(+, 1, 6555cb289e44b, -, 214),
1357 HEX_DBL(+, 1, e5ab364643354, -, 213),
1358 HEX_DBL(+, 1, 4a0bd18e64df7, -, 211),
1359 HEX_DBL(+, 1, c094499cc578e, -, 210),
1360 HEX_DBL(+, 1, 30d759323998c, -, 208),
1361 HEX_DBL(+, 1, 9e5278ab1d4cf, -, 207),
1362 HEX_DBL(+, 1, 198fa3f30be25, -, 205),
1363 HEX_DBL(+, 1, 7eae636d6144e, -, 204),
1364 HEX_DBL(+, 1, 040f1036f4863, -, 202),
1365 HEX_DBL(+, 1, 6174e477a895f, -, 201),
1366 HEX_DBL(+, 1, e065b82dd95a, -, 200),
1367 HEX_DBL(+, 1, 4676be491d129, -, 198),
1368 HEX_DBL(+, 1, bbb5da5f7c823, -, 197),
1369 HEX_DBL(+, 1, 2d884eef5fdcb, -, 195),
1370 HEX_DBL(+, 1, 99d3397ab8371, -, 194),
1371 HEX_DBL(+, 1, 1681497ed15b3, -, 192),
1372 HEX_DBL(+, 1, 7a870f597fdbd, -, 191),
1373 HEX_DBL(+, 1, 013c74edba307, -, 189),
1374 HEX_DBL(+, 1, 5d9ec4ada7938, -, 188),
1375 HEX_DBL(+, 1, db2edfd20fa7c, -, 187),
1376 HEX_DBL(+, 1, 42eb9f39afb0b, -, 185),
1377 HEX_DBL(+, 1, b6e4f282b43f4, -, 184),
1378 HEX_DBL(+, 1, 2a42764857b19, -, 182),
1379 HEX_DBL(+, 1, 9560792d19314, -, 181),
1380 HEX_DBL(+, 1, 137b6ce8e052c, -, 179),
1381 HEX_DBL(+, 1, 766b45dd84f18, -, 178),
1382 HEX_DBL(+, 1, fce362fe6e7d, -, 177),
1383 HEX_DBL(+, 1, 59d34dd8a5473, -, 175),
1384 HEX_DBL(+, 1, d606847fc727a, -, 174),
1385 HEX_DBL(+, 1, 3f6a58b795de3, -, 172),
1386 HEX_DBL(+, 1, b2216c6efdac1, -, 171),
1387 HEX_DBL(+, 1, 2705b5b153fb8, -, 169),
1388 HEX_DBL(+, 1, 90fa1509bd50d, -, 168),
1389 HEX_DBL(+, 1, 107df698da211, -, 166),
1390 HEX_DBL(+, 1, 725ae6e7b9d35, -, 165),
1391 HEX_DBL(+, 1, f75d6040aeff6, -, 164),
1392 HEX_DBL(+, 1, 56126259e093c, -, 162),
1393 HEX_DBL(+, 1, d0ec7df4f7bd4, -, 161),
1394 HEX_DBL(+, 1, 3bf2cf6722e46, -, 159),
1395 HEX_DBL(+, 1, ad6b22f55db42, -, 158),
1396 HEX_DBL(+, 1, 23d1f3e5834a, -, 156),
1397 HEX_DBL(+, 1, 8c9feab89b876, -, 155),
1398 HEX_DBL(+, 1, 0d88cf37f00dd, -, 153),
1399 HEX_DBL(+, 1, 6e55d2bf838a7, -, 152),
1400 HEX_DBL(+, 1, f1e6b68529e33, -, 151),
1401 HEX_DBL(+, 1, 525be4e4e601d, -, 149),
1402 HEX_DBL(+, 1, cbe0a45f75eb1, -, 148),
1403 HEX_DBL(+, 1, 3884e838aea68, -, 146),
1404 HEX_DBL(+, 1, a8c1f14e2af5d, -, 145),
1405 HEX_DBL(+, 1, 20a717e64a9bd, -, 143),
1406 HEX_DBL(+, 1, 8851d84118908, -, 142),
1407 HEX_DBL(+, 1, 0a9bdfb02d24, -, 140),
1408 HEX_DBL(+, 1, 6a5bea046b42e, -, 139),
1409 HEX_DBL(+, 1, ec7f3b269efa8, -, 138),
1410 HEX_DBL(+, 1, 4eafb87eab0f2, -, 136),
1411 HEX_DBL(+, 1, c6e2d05bbc, -, 135),
1412 HEX_DBL(+, 1, 35208867c2683, -, 133),
1413 HEX_DBL(+, 1, a425b317eeacd, -, 132),
1414 HEX_DBL(+, 1, 1d8508fa8246a, -, 130),
1415 HEX_DBL(+, 1, 840fbc08fdc8a, -, 129),
1416 HEX_DBL(+, 1, 07b7112bc1ffe, -, 127),
1417 HEX_DBL(+, 1, 666d0dad2961d, -, 126),
1418 HEX_DBL(+, 1, e726c3f64d0fe, -, 125),
1419 HEX_DBL(+, 1, 4b0dc07cabf98, -, 123),
1420 HEX_DBL(+, 1, c1f2daf3b6a46, -, 122),
1421 HEX_DBL(+, 1, 31c5957a47de2, -, 120),
1422 HEX_DBL(+, 1, 9f96445648b9f, -, 119),
1423 HEX_DBL(+, 1, 1a6baeadb4fd1, -, 117),
1424 HEX_DBL(+, 1, 7fd974d372e45, -, 116),
1425 HEX_DBL(+, 1, 04da4d1452919, -, 114),
1426 HEX_DBL(+, 1, 62891f06b345, -, 113),
1427 HEX_DBL(+, 1, e1dd273aa8a4a, -, 112),
1428 HEX_DBL(+, 1, 4775e0840bfdd, -, 110),
1429 HEX_DBL(+, 1, bd109d9d94bda, -, 109),
1430 HEX_DBL(+, 1, 2e73f53fba844, -, 107),
1431 HEX_DBL(+, 1, 9b138170d6bfe, -, 106),
1432 HEX_DBL(+, 1, 175af0cf60ec5, -, 104),
1433 HEX_DBL(+, 1, 7baee1bffa80b, -, 103),
1434 HEX_DBL(+, 1, 02057d1245ceb, -, 101),
1435 HEX_DBL(+, 1, 5eafffb34ba31, -, 100),
1436 HEX_DBL(+, 1, dca23bae16424, -, 99),
1437 HEX_DBL(+, 1, 43e7fc88b8056, -, 97),
1438 HEX_DBL(+, 1, b83bf23a9a9eb, -, 96),
1439 HEX_DBL(+, 1, 2b2b8dd05b318, -, 94),
1440 HEX_DBL(+, 1, 969d47321e4cc, -, 93),
1441 HEX_DBL(+, 1, 1452b7723aed2, -, 91),
1442 HEX_DBL(+, 1, 778fe2497184c, -, 90),
1443 HEX_DBL(+, 1, fe7116182e9cc, -, 89),
1444 HEX_DBL(+, 1, 5ae191a99585a, -, 87),
1445 HEX_DBL(+, 1, d775d87da854d, -, 86),
1446 HEX_DBL(+, 1, 4063f8cc8bb98, -, 84),
1447 HEX_DBL(+, 1, b374b315f87c1, -, 83),
1448 HEX_DBL(+, 1, 27ec458c65e3c, -, 81),
1449 HEX_DBL(+, 1, 923372c67a074, -, 80),
1450 HEX_DBL(+, 1, 1152eaeb73c08, -, 78),
1451 HEX_DBL(+, 1, 737c5645114b5, -, 77),
1452 HEX_DBL(+, 1, f8e6c24b5592e, -, 76),
1453 HEX_DBL(+, 1, 571db733a9d61, -, 74),
1454 HEX_DBL(+, 1, d257d547e083f, -, 73),
1455 HEX_DBL(+, 1, 3ce9b9de78f85, -, 71),
1456 HEX_DBL(+, 1, aebabae3a41b5, -, 70),
1457 HEX_DBL(+, 1, 24b6031b49bda, -, 68),
1458 HEX_DBL(+, 1, 8dd5e1bb09d7e, -, 67),
1459 HEX_DBL(+, 1, 0e5b73d1ff53d, -, 65),
1460 HEX_DBL(+, 1, 6f741de1748ec, -, 64),
1461 HEX_DBL(+, 1, f36bd37f42f3e, -, 63),
1462 HEX_DBL(+, 1, 536452ee2f75c, -, 61),
1463 HEX_DBL(+, 1, cd480a1b7482, -, 60),
1464 HEX_DBL(+, 1, 39792499b1a24, -, 58),
1465 HEX_DBL(+, 1, aa0de4bf35b38, -, 57),
1466 HEX_DBL(+, 1, 2188ad6ae3303, -, 55),
1467 HEX_DBL(+, 1, 898471fca6055, -, 54),
1468 HEX_DBL(+, 1, 0b6c3afdde064, -, 52),
1469 HEX_DBL(+, 1, 6b7719a59f0e, -, 51),
1470 HEX_DBL(+, 1, ee001eed62aa, -, 50),
1471 HEX_DBL(+, 1, 4fb547c775da8, -, 48),
1472 HEX_DBL(+, 1, c8464f7616468, -, 47),
1473 HEX_DBL(+, 1, 36121e24d3bba, -, 45),
1474 HEX_DBL(+, 1, a56e0c2ac7f75, -, 44),
1475 HEX_DBL(+, 1, 1e642baeb84a, -, 42),
1476 HEX_DBL(+, 1, 853f01d6d53ba, -, 41),
1477 HEX_DBL(+, 1, 0885298767e9a, -, 39),
1478 HEX_DBL(+, 1, 67852a7007e42, -, 38),
1479 HEX_DBL(+, 1, e8a37a45fc32e, -, 37),
1480 HEX_DBL(+, 1, 4c1078fe9228a, -, 35),
1481 HEX_DBL(+, 1, c3527e433fab1, -, 34),
1482 HEX_DBL(+, 1, 32b48bf117da2, -, 32),
1483 HEX_DBL(+, 1, a0db0d0ddb3ec, -, 31),
1484 HEX_DBL(+, 1, 1b48655f37267, -, 29),
1485 HEX_DBL(+, 1, 81056ff2c5772, -, 28),
1486 HEX_DBL(+, 1, 05a628c699fa1, -, 26),
1487 HEX_DBL(+, 1, 639e3175a689d, -, 25),
1488 HEX_DBL(+, 1, e355bbaee85cb, -, 24),
1489 HEX_DBL(+, 1, 4875ca227ec38, -, 22),
1490 HEX_DBL(+, 1, be6c6fdb01612, -, 21),
1491 HEX_DBL(+, 1, 2f6053b981d98, -, 19),
1492 HEX_DBL(+, 1, 9c54c3b43bc8b, -, 18),
1493 HEX_DBL(+, 1, 18354238f6764, -, 16),
1494 HEX_DBL(+, 1, 7cd79b5647c9b, -, 15),
1495 HEX_DBL(+, 1, 02cf22526545a, -, 13),
1496 HEX_DBL(+, 1, 5fc21041027ad, -, 12),
1497 HEX_DBL(+, 1, de16b9c24a98f, -, 11),
1498 HEX_DBL(+, 1, 44e51f113d4d6, -, 9),
1499 HEX_DBL(+, 1, b993fe00d5376, -, 8),
1500 HEX_DBL(+, 1, 2c155b8213cf4, -, 6),
1501 HEX_DBL(+, 1, 97db0ccceb0af, -, 5),
1502 HEX_DBL(+, 1, 152aaa3bf81cc, -, 3),
1503 HEX_DBL(+, 1, 78b56362cef38, -, 2),
1504 HEX_DBL(+, 1, 0, +, 0),
1505 HEX_DBL(+, 1, 5bf0a8b145769, +, 1),
1506 HEX_DBL(+, 1, d8e64b8d4ddae, +, 2),
1507 HEX_DBL(+, 1, 415e5bf6fb106, +, 4),
1508 HEX_DBL(+, 1, b4c902e273a58, +, 5),
1509 HEX_DBL(+, 1, 28d389970338f, +, 7),
1510 HEX_DBL(+, 1, 936dc5690c08f, +, 8),
1511 HEX_DBL(+, 1, 122885aaeddaa, +, 10),
1512 HEX_DBL(+, 1, 749ea7d470c6e, +, 11),
1513 HEX_DBL(+, 1, fa7157c470f82, +, 12),
1514 HEX_DBL(+, 1, 5829dcf95056, +, 14),
1515 HEX_DBL(+, 1, d3c4488ee4f7f, +, 15),
1516 HEX_DBL(+, 1, 3de1654d37c9a, +, 17),
1517 HEX_DBL(+, 1, b00b5916ac955, +, 18),
1518 HEX_DBL(+, 1, 259ac48bf05d7, +, 20),
1519 HEX_DBL(+, 1, 8f0ccafad2a87, +, 21),
1520 HEX_DBL(+, 1, 0f2ebd0a8002, +, 23),
1521 HEX_DBL(+, 1, 709348c0ea4f9, +, 24),
1522 HEX_DBL(+, 1, f4f22091940bd, +, 25),
1523 HEX_DBL(+, 1, 546d8f9ed26e1, +, 27),
1524 HEX_DBL(+, 1, ceb088b68e804, +, 28),
1525 HEX_DBL(+, 1, 3a6e1fd9eecfd, +, 30),
1526 HEX_DBL(+, 1, ab5adb9c436, +, 31),
1527 HEX_DBL(+, 1, 226af33b1fdc1, +, 33),
1528 HEX_DBL(+, 1, 8ab7fb5475fb7, +, 34),
1529 HEX_DBL(+, 1, 0c3d3920962c9, +, 36),
1530 HEX_DBL(+, 1, 6c932696a6b5d, +, 37),
1531 HEX_DBL(+, 1, ef822f7f6731d, +, 38),
1532 HEX_DBL(+, 1, 50bba3796379a, +, 40),
1533 HEX_DBL(+, 1, c9aae4631c056, +, 41),
1534 HEX_DBL(+, 1, 370470aec28ed, +, 43),
1535 HEX_DBL(+, 1, a6b765d8cdf6d, +, 44),
1536 HEX_DBL(+, 1, 1f43fcc4b662c, +, 46),
1537 HEX_DBL(+, 1, 866f34a725782, +, 47),
1538 HEX_DBL(+, 1, 0953e2f3a1ef7, +, 49),
1539 HEX_DBL(+, 1, 689e221bc8d5b, +, 50),
1540 HEX_DBL(+, 1, ea215a1d20d76, +, 51),
1541 HEX_DBL(+, 1, 4d13fbb1a001a, +, 53),
1542 HEX_DBL(+, 1, c4b334617cc67, +, 54),
1543 HEX_DBL(+, 1, 33a43d282a519, +, 56),
1544 HEX_DBL(+, 1, a220d397972eb, +, 57),
1545 HEX_DBL(+, 1, 1c25c88df6862, +, 59),
1546 HEX_DBL(+, 1, 8232558201159, +, 60),
1547 HEX_DBL(+, 1, 0672a3c9eb871, +, 62),
1548 HEX_DBL(+, 1, 64b41c6d37832, +, 63),
1549 HEX_DBL(+, 1, e4cf766fe49be, +, 64),
1550 HEX_DBL(+, 1, 49767bc0483e3, +, 66),
1551 HEX_DBL(+, 1, bfc951eb8bb76, +, 67),
1552 HEX_DBL(+, 1, 304d6aeca254b, +, 69),
1553 HEX_DBL(+, 1, 9d97010884251, +, 70),
1554 HEX_DBL(+, 1, 19103e4080b45, +, 72),
1555 HEX_DBL(+, 1, 7e013cd114461, +, 73),
1556 HEX_DBL(+, 1, 03996528e074c, +, 75),
1557 HEX_DBL(+, 1, 60d4f6fdac731, +, 76),
1558 HEX_DBL(+, 1, df8c5af17ba3b, +, 77),
1559 HEX_DBL(+, 1, 45e3076d61699, +, 79),
1560 HEX_DBL(+, 1, baed16a6e0da7, +, 80),
1561 HEX_DBL(+, 1, 2cffdfebde1a1, +, 82),
1562 HEX_DBL(+, 1, 9919cabefcb69, +, 83),
1563 HEX_DBL(+, 1, 160345c9953e3, +, 85),
1564 HEX_DBL(+, 1, 79dbc9dc53c66, +, 86),
1565 HEX_DBL(+, 1, 00c810d464097, +, 88),
1566 HEX_DBL(+, 1, 5d009394c5c27, +, 89),
1567 HEX_DBL(+, 1, da57de8f107a8, +, 90),
1568 HEX_DBL(+, 1, 425982cf597cd, +, 92),
1569 HEX_DBL(+, 1, b61e5ca3a5e31, +, 93),
1570 HEX_DBL(+, 1, 29bb825dfcf87, +, 95),
1571 HEX_DBL(+, 1, 94a90db0d6fe2, +, 96),
1572 HEX_DBL(+, 1, 12fec759586fd, +, 98),
1573 HEX_DBL(+, 1, 75c1dc469e3af, +, 99),
1574 HEX_DBL(+, 1, fbfd219c43b04, +, 100),
1575 HEX_DBL(+, 1, 5936d44e1a146, +, 102),
1576 HEX_DBL(+, 1, d531d8a7ee79c, +, 103),
1577 HEX_DBL(+, 1, 3ed9d24a2d51b, +, 105),
1578 HEX_DBL(+, 1, b15cfe5b6e17b, +, 106),
1579 HEX_DBL(+, 1, 268038c2c0e, +, 108),
1580 HEX_DBL(+, 1, 9044a73545d48, +, 109),
1581 HEX_DBL(+, 1, 1002ab6218b38, +, 111),
1582 HEX_DBL(+, 1, 71b3540cbf921, +, 112),
1583 HEX_DBL(+, 1, f6799ea9c414a, +, 113),
1584 HEX_DBL(+, 1, 55779b984f3eb, +, 115),
1585 HEX_DBL(+, 1, d01a210c44aa4, +, 116),
1586 HEX_DBL(+, 1, 3b63da8e9121, +, 118),
1587 HEX_DBL(+, 1, aca8d6b0116b8, +, 119),
1588 HEX_DBL(+, 1, 234de9e0c74e9, +, 121),
1589 HEX_DBL(+, 1, 8bec7503ca477, +, 122),
1590 HEX_DBL(+, 1, 0d0eda9796b9, +, 124),
1591 HEX_DBL(+, 1, 6db0118477245, +, 125),
1592 HEX_DBL(+, 1, f1056dc7bf22d, +, 126),
1593 HEX_DBL(+, 1, 51c2cc3433801, +, 128),
1594 HEX_DBL(+, 1, cb108ffbec164, +, 129),
1595 HEX_DBL(+, 1, 37f780991b584, +, 131),
1596 HEX_DBL(+, 1, a801c0ea8ac4d, +, 132),
1597 HEX_DBL(+, 1, 20247cc4c46c1, +, 134),
1598 HEX_DBL(+, 1, 87a0553328015, +, 135),
1599 HEX_DBL(+, 1, 0a233dee4f9bb, +, 137),
1600 HEX_DBL(+, 1, 69b7f55b808ba, +, 138),
1601 HEX_DBL(+, 1, eba064644060a, +, 139),
1602 HEX_DBL(+, 1, 4e184933d9364, +, 141),
1603 HEX_DBL(+, 1, c614fe2531841, +, 142),
1604 HEX_DBL(+, 1, 3494a9b171bf5, +, 144),
1605 HEX_DBL(+, 1, a36798b9d969b, +, 145),
1606 HEX_DBL(+, 1, 1d03d8c0c04af, +, 147),
1607 HEX_DBL(+, 1, 836026385c974, +, 148),
1608 HEX_DBL(+, 1, 073fbe9ac901d, +, 150),
1609 HEX_DBL(+, 1, 65cae0969f286, +, 151),
1610 HEX_DBL(+, 1, e64a58639cae8, +, 152),
1611 HEX_DBL(+, 1, 4a77f5f9b50f9, +, 154),
1612 HEX_DBL(+, 1, c12744a3a28e3, +, 155),
1613 HEX_DBL(+, 1, 313b3b6978e85, +, 157),
1614 HEX_DBL(+, 1, 9eda3a31e587e, +, 158),
1615 HEX_DBL(+, 1, 19ebe56b56453, +, 160),
1616 HEX_DBL(+, 1, 7f2bc6e599b7e, +, 161),
1617 HEX_DBL(+, 1, 04644610df2ff, +, 163),
1618 HEX_DBL(+, 1, 61e8b490ac4e6, +, 164),
1619 HEX_DBL(+, 1, e103201f299b3, +, 165),
1620 HEX_DBL(+, 1, 46e1b637beaf5, +, 167),
1621 HEX_DBL(+, 1, bc473cfede104, +, 168),
1622 HEX_DBL(+, 1, 2deb1b9c85e2d, +, 170),
1623 HEX_DBL(+, 1, 9a5981ca67d1, +, 171),
1624 HEX_DBL(+, 1, 16dc8a9ef670b, +, 173),
1625 HEX_DBL(+, 1, 7b03166942309, +, 174),
1626 HEX_DBL(+, 1, 0190be03150a7, +, 176),
1627 HEX_DBL(+, 1, 5e1152f9a8119, +, 177),
1628 HEX_DBL(+, 1, dbca9263f8487, +, 178),
1629 HEX_DBL(+, 1, 43556dee93bee, +, 180),
1630 HEX_DBL(+, 1, b774c12967dfa, +, 181),
1631 HEX_DBL(+, 1, 2aa4306e922c2, +, 183),
1632 HEX_DBL(+, 1, 95e54c5dd4217, +, 184)
1633 };
1634
1635 // scale by e**i -- (expm1(f) + 1)*e**i - 1 = expm1(f) * e**i + e**i - 1 =
1636 // e**i
1637 return exp_table[exponent + 150] + (f * exp_table[exponent + 150] - 1.0);
1638 }
1639
1640
reference_fmax(double x,double y)1641 double reference_fmax(double x, double y)
1642 {
1643 if (isnan(y)) return x;
1644
1645 return x >= y ? x : y;
1646 }
1647
reference_fmin(double x,double y)1648 double reference_fmin(double x, double y)
1649 {
1650 if (isnan(y)) return x;
1651
1652 return x <= y ? x : y;
1653 }
1654
reference_hypot(double x,double y)1655 double reference_hypot(double x, double y)
1656 {
1657 // Since the inputs are actually floats, we don't have to worry about range
1658 // here
1659 if (isinf(x) || isinf(y)) return INFINITY;
1660
1661 return sqrt(x * x + y * y);
1662 }
1663
reference_ilogbl(long double x)1664 int reference_ilogbl(long double x)
1665 {
1666 extern int gDeviceILogb0, gDeviceILogbNaN;
1667
1668 // Since we are just using this to verify double precision, we can
1669 // use the double precision ilogb here
1670 union {
1671 double f;
1672 cl_ulong u;
1673 } u;
1674 u.f = (double)x;
1675
1676 int exponent = (int)(u.u >> 52) & 0x7ff;
1677 if (exponent == 0x7ff)
1678 {
1679 if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
1680
1681 return CL_INT_MAX;
1682 }
1683
1684 if (exponent == 0)
1685 { // deal with denormals
1686 u.f = x * HEX_DBL(+, 1, 0, +, 64);
1687 exponent = (cl_uint)(u.u >> 52) & 0x7ff;
1688 if (exponent == 0) return gDeviceILogb0;
1689
1690 exponent -= 1023 + 64;
1691 return exponent;
1692 }
1693
1694 return exponent - 1023;
1695 }
1696
reference_relaxed_log2(double x)1697 double reference_relaxed_log2(double x) { return reference_log2(x); }
1698
reference_log2(double x)1699 double reference_log2(double x)
1700 {
1701 if (isnan(x) || x < 0.0 || x == -INFINITY) return cl_make_nan();
1702
1703 if (x == 0.0f) return -INFINITY;
1704
1705 if (x == INFINITY) return INFINITY;
1706
1707 double hi, lo;
1708 __log2_ep(&hi, &lo, x);
1709 return hi;
1710 }
1711
reference_log1p(double x)1712 double reference_log1p(double x)
1713 { // This function is suitable only for verifying log1pf(). It produces several
1714 // double precision ulps of error.
1715
1716 // Handle small and NaN
1717 if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 53))) return x;
1718
1719 // deal with special values
1720 if (x <= -1.0)
1721 {
1722 if (x < -1.0) return cl_make_nan();
1723 return -INFINITY;
1724 }
1725
1726 // infinity
1727 if (x == INFINITY) return INFINITY;
1728
1729 // High precision result for when near 0, to avoid problems with the
1730 // reference result falling in the wrong binade.
1731 if (reference_fabs(x) < HEX_DBL(+, 1, 0, -, 28)) return (1.0 - 0.5 * x) * x;
1732
1733 // Our polynomial is only good in the region +-2**-4.
1734 // If we aren't in that range then we need to reduce to be in that range
1735 double correctionLo =
1736 -0.0; // correction down stream to compensate for the reduction, if any
1737 double correctionHi =
1738 -0.0; // correction down stream to compensate for the exponent, if any
1739 if (reference_fabs(x) > HEX_DBL(+, 1, 0, -, 4))
1740 {
1741 x += 1.0; // double should cover any loss of precision here
1742
1743 // separate x into (1+f) * 2**i
1744 union {
1745 double d;
1746 cl_ulong u;
1747 } u;
1748 u.d = x;
1749 int i = (int)((u.u >> 52) & 0x7ff) - 1023;
1750 u.u &= 0x000fffffffffffffULL;
1751 int index = (int)(u.u >> 48);
1752 u.u |= 0x3ff0000000000000ULL;
1753 double f = u.d;
1754
1755 // further reduce f to be within 1/16 of 1.0
1756 static const double scale_table[16] = {
1757 1.0,
1758 HEX_DBL(+, 1, d2d2d2d6e3f79, -, 1),
1759 HEX_DBL(+, 1, b8e38e42737a1, -, 1),
1760 HEX_DBL(+, 1, a1af28711adf3, -, 1),
1761 HEX_DBL(+, 1, 8cccccd88dd65, -, 1),
1762 HEX_DBL(+, 1, 79e79e810ec8f, -, 1),
1763 HEX_DBL(+, 1, 68ba2e94df404, -, 1),
1764 HEX_DBL(+, 1, 590b216defb29, -, 1),
1765 HEX_DBL(+, 1, 4aaaaab1500ed, -, 1),
1766 HEX_DBL(+, 1, 3d70a3e0d6f73, -, 1),
1767 HEX_DBL(+, 1, 313b13bb39f4f, -, 1),
1768 HEX_DBL(+, 1, 25ed09823f1cc, -, 1),
1769 HEX_DBL(+, 1, 1b6db6e77457b, -, 1),
1770 HEX_DBL(+, 1, 11a7b96a3a34f, -, 1),
1771 HEX_DBL(+, 1, 0888888e46fea, -, 1),
1772 HEX_DBL(+, 1, 00000038e9862, -, 1)
1773 };
1774
1775 // correction_table[i] = -log( scale_table[i] )
1776 // All entries have >= 64 bits of precision (rather than the expected
1777 // 53)
1778 static const double correction_table[16] = {
1779 -0.0,
1780 HEX_DBL(+, 1, 7a5c722c16058, -, 4),
1781 HEX_DBL(+, 1, 323db16c89ab1, -, 3),
1782 HEX_DBL(+, 1, a0f87d180629, -, 3),
1783 HEX_DBL(+, 1, 050279324e17c, -, 2),
1784 HEX_DBL(+, 1, 36f885bb270b0, -, 2),
1785 HEX_DBL(+, 1, 669b771b5cc69, -, 2),
1786 HEX_DBL(+, 1, 94203a6292a05, -, 2),
1787 HEX_DBL(+, 1, bfb4f9cb333a4, -, 2),
1788 HEX_DBL(+, 1, e982376ddb80e, -, 2),
1789 HEX_DBL(+, 1, 08d5d8769b2b2, -, 1),
1790 HEX_DBL(+, 1, 1c288bc00e0cf, -, 1),
1791 HEX_DBL(+, 1, 2ec7535b31ecb, -, 1),
1792 HEX_DBL(+, 1, 40bed0adc63fb, -, 1),
1793 HEX_DBL(+, 1, 521a5c0330615, -, 1),
1794 HEX_DBL(+, 1, 62e42f7dd092c, -, 1)
1795 };
1796
1797 f *= scale_table[index];
1798 correctionLo = correction_table[index];
1799
1800 // log( 2**(i) ) = i * log(2)
1801 correctionHi = (double)i * 0.693147180559945309417232121458176568;
1802
1803 x = f - 1.0;
1804 }
1805
1806
1807 // minmax polynomial for p(x) = (log(x+1) - x)/x valid over the range x =
1808 // [-1/16, 1/16]
1809 // max error HEX_DBL( +, 1, 048f61f9a5eca, -, 52 )
1810 double p = HEX_DBL(-, 1, cc33de97a9d7b, -, 46)
1811 + (HEX_DBL(-, 1, fffffffff3eb7, -, 2)
1812 + (HEX_DBL(+, 1, 5555555633ef7, -, 2)
1813 + (HEX_DBL(-, 1, 00000062c78, -, 2)
1814 + (HEX_DBL(+, 1, 9999958a3321, -, 3)
1815 + (HEX_DBL(-, 1, 55534ce65c347, -, 3)
1816 + (HEX_DBL(+, 1, 24957208391a5, -, 3)
1817 + (HEX_DBL(-, 1, 02287b9a5b4a1, -, 3)
1818 + HEX_DBL(+, 1, c757d922180ed, -, 4) * x)
1819 * x)
1820 * x)
1821 * x)
1822 * x)
1823 * x)
1824 * x)
1825 * x;
1826
1827 // log(x+1) = x * p(x) + x
1828 x += x * p;
1829
1830 return correctionHi + (correctionLo + x);
1831 }
1832
reference_logb(double x)1833 double reference_logb(double x)
1834 {
1835 union {
1836 float f;
1837 cl_uint u;
1838 } u;
1839 u.f = (float)x;
1840
1841 cl_int exponent = (u.u >> 23) & 0xff;
1842 if (exponent == 0xff) return x * x;
1843
1844 if (exponent == 0)
1845 { // deal with denormals
1846 u.u = (u.u & 0x007fffff) | 0x3f800000;
1847 u.f -= 1.0f;
1848 exponent = (u.u >> 23) & 0xff;
1849 if (exponent == 0) return -INFINITY;
1850
1851 return exponent - (127 + 126);
1852 }
1853
1854 return exponent - 127;
1855 }
1856
reference_relaxed_reciprocal(double x)1857 double reference_relaxed_reciprocal(double x) { return 1.0f / ((float)x); }
1858
reference_reciprocal(double x)1859 double reference_reciprocal(double x) { return 1.0 / x; }
1860
reference_remainder(double x,double y)1861 double reference_remainder(double x, double y)
1862 {
1863 int i;
1864 return reference_remquo(x, y, &i);
1865 }
1866
reference_lgamma(double x)1867 double reference_lgamma(double x)
1868 {
1869 /*
1870 * ====================================================
1871 * This function is from fdlibm. http://www.netlib.org
1872 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1873 *
1874 * Developed at SunSoft, a Sun Microsystems, Inc. business.
1875 * Permission to use, copy, modify, and distribute this
1876 * software is freely granted, provided that this notice
1877 * is preserved.
1878 * ====================================================
1879 *
1880 */
1881
1882 static const double // two52 = 4.50359962737049600000e+15, /* 0x43300000,
1883 // 0x00000000 */
1884 half = 5.00000000000000000000e-01, /* 0x3FE00000,
1885 0x00000000 */
1886 one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
1887 pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
1888 a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
1889 a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
1890 a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
1891 a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
1892 a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
1893 a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
1894 a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
1895 a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
1896 a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
1897 a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
1898 a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
1899 a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
1900 tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
1901 tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
1902 /* tt = -(tail of tf) */
1903 tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
1904 t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
1905 t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
1906 t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
1907 t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
1908 t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
1909 t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
1910 t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
1911 t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
1912 t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
1913 t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
1914 t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
1915 t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
1916 t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
1917 t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
1918 t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
1919 u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
1920 u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
1921 u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
1922 u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
1923 u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
1924 u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
1925 v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
1926 v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
1927 v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
1928 v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
1929 v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
1930 s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
1931 s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
1932 s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
1933 s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
1934 s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
1935 s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
1936 s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
1937 r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
1938 r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
1939 r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
1940 r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
1941 r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
1942 r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
1943 w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
1944 w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
1945 w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
1946 w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
1947 w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
1948 w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
1949 w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
1950
1951 static const double zero = 0.00000000000000000000e+00;
1952 double t, y, z, nadj, p, p1, p2, p3, q, r, w;
1953 cl_int i, hx, lx, ix;
1954
1955 union {
1956 double d;
1957 cl_ulong u;
1958 } u;
1959 u.d = x;
1960
1961 hx = (cl_int)(u.u >> 32);
1962 lx = (cl_int)(u.u & 0xffffffffULL);
1963
1964 /* purge off +-inf, NaN, +-0, and negative arguments */
1965 // *signgamp = 1;
1966 ix = hx & 0x7fffffff;
1967 if (ix >= 0x7ff00000) return x * x;
1968 if ((ix | lx) == 0) return INFINITY;
1969 if (ix < 0x3b900000)
1970 { /* |x|<2**-70, return -log(|x|) */
1971 if (hx < 0)
1972 {
1973 // *signgamp = -1;
1974 return -reference_log(-x);
1975 }
1976 else
1977 return -reference_log(x);
1978 }
1979 if (hx < 0)
1980 {
1981 if (ix >= 0x43300000) /* |x|>=2**52, must be -integer */
1982 return INFINITY;
1983 t = reference_sinpi(x);
1984 if (t == zero) return INFINITY; /* -integer */
1985 nadj = reference_log(pi / reference_fabs(t * x));
1986 // if(t<zero) *signgamp = -1;
1987 x = -x;
1988 }
1989
1990 /* purge off 1 and 2 */
1991 if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0))
1992 r = 0;
1993 /* for x < 2.0 */
1994 else if (ix < 0x40000000)
1995 {
1996 if (ix <= 0x3feccccc)
1997 { /* lgamma(x) = lgamma(x+1)-log(x) */
1998 r = -reference_log(x);
1999 if (ix >= 0x3FE76944)
2000 {
2001 y = 1.0 - x;
2002 i = 0;
2003 }
2004 else if (ix >= 0x3FCDA661)
2005 {
2006 y = x - (tc - one);
2007 i = 1;
2008 }
2009 else
2010 {
2011 y = x;
2012 i = 2;
2013 }
2014 }
2015 else
2016 {
2017 r = zero;
2018 if (ix >= 0x3FFBB4C3)
2019 {
2020 y = 2.0 - x;
2021 i = 0;
2022 } /* [1.7316,2] */
2023 else if (ix >= 0x3FF3B4C4)
2024 {
2025 y = x - tc;
2026 i = 1;
2027 } /* [1.23,1.73] */
2028 else
2029 {
2030 y = x - one;
2031 i = 2;
2032 }
2033 }
2034 switch (i)
2035 {
2036 case 0:
2037 z = y * y;
2038 p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
2039 p2 = z
2040 * (a1
2041 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
2042 p = y * p1 + p2;
2043 r += (p - 0.5 * y);
2044 break;
2045 case 1:
2046 z = y * y;
2047 w = z * y;
2048 p1 = t0
2049 + w
2050 * (t3
2051 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */
2052 p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
2053 p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
2054 p = z * p1 - (tt - w * (p2 + y * p3));
2055 r += (tf + p);
2056 break;
2057 case 2:
2058 p1 = y
2059 * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
2060 p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
2061 r += (-0.5 * y + p1 / p2);
2062 }
2063 }
2064 else if (ix < 0x40200000)
2065 { /* x < 8.0 */
2066 i = (int)x;
2067 t = zero;
2068 y = x - (double)i;
2069 p = y
2070 * (s0
2071 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
2072 q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
2073 r = half * y + p / q;
2074 z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
2075 switch (i)
2076 {
2077 case 7: z *= (y + 6.0); /* FALLTHRU */
2078 case 6: z *= (y + 5.0); /* FALLTHRU */
2079 case 5: z *= (y + 4.0); /* FALLTHRU */
2080 case 4: z *= (y + 3.0); /* FALLTHRU */
2081 case 3:
2082 z *= (y + 2.0); /* FALLTHRU */
2083 r += reference_log(z);
2084 break;
2085 }
2086 /* 8.0 <= x < 2**58 */
2087 }
2088 else if (ix < 0x43900000)
2089 {
2090 t = reference_log(x);
2091 z = one / x;
2092 y = z * z;
2093 w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
2094 r = (x - half) * (t - one) + w;
2095 }
2096 else
2097 /* 2**58 <= x <= inf */
2098 r = x * (reference_log(x) - one);
2099 if (hx < 0) r = nadj - r;
2100 return r;
2101 }
2102
2103 #endif // _MSC_VER
2104
reference_assignment(double x)2105 double reference_assignment(double x) { return x; }
2106
reference_not(double x)2107 int reference_not(double x)
2108 {
2109 int r = !x;
2110 return r;
2111 }
2112
2113 #pragma mark -
2114 #pragma mark Double testing
2115
2116 #ifndef M_PIL
2117 #define M_PIL \
2118 3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L
2119 #endif
2120
2121 static long double reduce1l(long double x);
2122
2123 #ifdef __PPC__
2124 // Since long double on PPC is really extended precision double arithmetic
2125 // consisting of two doubles (a high and low). This form of long double has
2126 // the potential of representing a number with more than LDBL_MANT_DIG digits
2127 // such that reduction algorithm used for other architectures will not work.
2128 // Instead and alternate reduction method is used.
2129
reduce1l(long double x)2130 static long double reduce1l(long double x)
2131 {
2132 union {
2133 long double ld;
2134 double d[2];
2135 } u;
2136
2137 // Reduce the high and low halfs separately.
2138 u.ld = x;
2139 return ((long double)reduce1(u.d[0]) + reduce1(u.d[1]));
2140 }
2141
2142 #else // !__PPC__
2143
reduce1l(long double x)2144 static long double reduce1l(long double x)
2145 {
2146 static long double unit_exp = 0;
2147 if (0.0L == unit_exp) unit_exp = scalbnl(1.0L, LDBL_MANT_DIG);
2148
2149 if (reference_fabsl(x) >= unit_exp)
2150 {
2151 if (reference_fabsl(x) == INFINITY) return cl_make_nan();
2152
2153 return 0.0L; // we patch up the sign for sinPi and cosPi later, since
2154 // they need different signs
2155 }
2156
2157 // Find the nearest multiple of 2
2158 const long double r = reference_copysignl(unit_exp, x);
2159 long double z = x + r;
2160 z -= r;
2161
2162 // subtract it from x. Value is now in the range -1 <= x <= 1
2163 return x - z;
2164 }
2165 #endif // __PPC__
2166
reference_acospil(long double x)2167 long double reference_acospil(long double x)
2168 {
2169 return reference_acosl(x) / M_PIL;
2170 }
reference_asinpil(long double x)2171 long double reference_asinpil(long double x)
2172 {
2173 return reference_asinl(x) / M_PIL;
2174 }
reference_atanpil(long double x)2175 long double reference_atanpil(long double x)
2176 {
2177 return reference_atanl(x) / M_PIL;
2178 }
reference_atan2pil(long double y,long double x)2179 long double reference_atan2pil(long double y, long double x)
2180 {
2181 return reference_atan2l(y, x) / M_PIL;
2182 }
reference_cospil(long double x)2183 long double reference_cospil(long double x)
2184 {
2185 if (reference_fabsl(x) >= HEX_LDBL(+, 1, 0, +, 54))
2186 {
2187 if (reference_fabsl(x) == INFINITY) return cl_make_nan();
2188
2189 // Note this probably fails for odd values between 0x1.0p52 and
2190 // 0x1.0p53. However, when starting with single precision inputs, there
2191 // will be no odd values.
2192
2193 return 1.0L;
2194 }
2195
2196 x = reduce1l(x);
2197
2198 #if DBL_MANT_DIG >= LDBL_MANT_DIG
2199
2200 // phase adjust
2201 double xhi = 0.0;
2202 double xlo = 0.0;
2203 xhi = (double)x + 0.5;
2204
2205 if (reference_fabsl(x) > 0.5L)
2206 {
2207 xlo = xhi - x;
2208 xlo = 0.5 - xlo;
2209 }
2210 else
2211 {
2212 xlo = xhi - 0.5;
2213 xlo = x - xlo;
2214 }
2215
2216 // reduce to [-0.5, 0.5]
2217 if (xhi < -0.5)
2218 {
2219 xhi = -1.0 - xhi;
2220 xlo = -xlo;
2221 }
2222 else if (xhi > 0.5)
2223 {
2224 xhi = 1.0 - xhi;
2225 xlo = -xlo;
2226 }
2227
2228 // cosPi zeros are all +0
2229 if (xhi == 0.0 && xlo == 0.0) return 0.0;
2230
2231 xhi *= M_PI;
2232 xlo *= M_PI;
2233
2234 xhi += xlo;
2235
2236 return reference_sinl(xhi);
2237
2238 #else
2239 // phase adjust
2240 x += 0.5L;
2241
2242 // reduce to [-0.5, 0.5]
2243 if (x < -0.5L)
2244 x = -1.0L - x;
2245 else if (x > 0.5L)
2246 x = 1.0L - x;
2247
2248 // cosPi zeros are all +0
2249 if (x == 0.0L) return 0.0L;
2250
2251 return reference_sinl(x * M_PIL);
2252 #endif
2253 }
2254
reference_dividel(long double x,long double y)2255 long double reference_dividel(long double x, long double y)
2256 {
2257 double dx = x;
2258 double dy = y;
2259 return dx / dy;
2260 }
2261
2262 typedef struct
2263 {
2264 double hi, lo;
2265 } double_double;
2266
2267 // Split doubles_double into a series of consecutive 26-bit precise doubles and
2268 // a remainder. Note for later -- for multiplication, it might be better to
2269 // split each double into a power of two and two 26 bit portions
2270 // multiplication of a double double by a known power of
2271 // two is cheap. The current approach causes some inexact
2272 // arithmetic in mul_dd.
split_dd(double_double x,double_double * hi,double_double * lo)2273 static inline void split_dd(double_double x, double_double *hi,
2274 double_double *lo)
2275 {
2276 union {
2277 double d;
2278 cl_ulong u;
2279 } u;
2280 u.d = x.hi;
2281 u.u &= 0xFFFFFFFFF8000000ULL;
2282 hi->hi = u.d;
2283 x.hi -= u.d;
2284
2285 u.d = x.hi;
2286 u.u &= 0xFFFFFFFFF8000000ULL;
2287 hi->lo = u.d;
2288 x.hi -= u.d;
2289
2290 double temp = x.hi;
2291 x.hi += x.lo;
2292 x.lo -= x.hi - temp;
2293 u.d = x.hi;
2294 u.u &= 0xFFFFFFFFF8000000ULL;
2295 lo->hi = u.d;
2296 x.hi -= u.d;
2297
2298 lo->lo = x.hi + x.lo;
2299 }
2300
accum_d(double_double a,double b)2301 static inline double_double accum_d(double_double a, double b)
2302 {
2303 double temp;
2304 if (fabs(b) > fabs(a.hi))
2305 {
2306 temp = a.hi;
2307 a.hi += b;
2308 a.lo += temp - (a.hi - b);
2309 }
2310 else
2311 {
2312 temp = a.hi;
2313 a.hi += b;
2314 a.lo += b - (a.hi - temp);
2315 }
2316
2317 if (isnan(a.lo)) a.lo = 0.0;
2318
2319 return a;
2320 }
2321
add_dd(double_double a,double_double b)2322 static inline double_double add_dd(double_double a, double_double b)
2323 {
2324 double_double r = { -0.0 - 0.0 };
2325
2326 if (isinf(a.hi) || isinf(b.hi) || isnan(a.hi) || isnan(b.hi) || 0.0 == a.hi
2327 || 0.0 == b.hi)
2328 {
2329 r.hi = a.hi + b.hi;
2330 r.lo = a.lo + b.lo;
2331 if (isnan(r.lo)) r.lo = 0.0;
2332 return r;
2333 }
2334
2335 // merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|,
2336 // |b.hi| > |b.lo|, so we don't have to do the first merge pass
2337 double terms[4] = { a.hi, b.hi, a.lo, b.lo };
2338 double temp;
2339
2340 // Sort hi terms
2341 if (fabs(terms[0]) < fabs(terms[1]))
2342 {
2343 temp = terms[0];
2344 terms[0] = terms[1];
2345 terms[1] = temp;
2346 }
2347 // sort lo terms
2348 if (fabs(terms[2]) < fabs(terms[3]))
2349 {
2350 temp = terms[2];
2351 terms[2] = terms[3];
2352 terms[3] = temp;
2353 }
2354 // Fix case where small high term is less than large low term
2355 if (fabs(terms[1]) < fabs(terms[2]))
2356 {
2357 temp = terms[1];
2358 terms[1] = terms[2];
2359 terms[2] = temp;
2360 }
2361
2362 // accumulate the results
2363 r.hi = terms[2] + terms[3];
2364 r.lo = terms[3] - (r.hi - terms[2]);
2365
2366 temp = r.hi;
2367 r.hi += terms[1];
2368 r.lo += temp - (r.hi - terms[1]);
2369
2370 temp = r.hi;
2371 r.hi += terms[0];
2372 r.lo += temp - (r.hi - terms[0]);
2373
2374 // canonicalize the result
2375 temp = r.hi;
2376 r.hi += r.lo;
2377 r.lo = r.lo - (r.hi - temp);
2378 if (isnan(r.lo)) r.lo = 0.0;
2379
2380 return r;
2381 }
2382
mul_dd(double_double a,double_double b)2383 static inline double_double mul_dd(double_double a, double_double b)
2384 {
2385 double_double result = { -0.0, -0.0 };
2386
2387 // Inf, nan and 0
2388 if (isnan(a.hi) || isnan(b.hi) || isinf(a.hi) || isinf(b.hi) || 0.0 == a.hi
2389 || 0.0 == b.hi)
2390 {
2391 result.hi = a.hi * b.hi;
2392 return result;
2393 }
2394
2395 double_double ah, al, bh, bl;
2396 split_dd(a, &ah, &al);
2397 split_dd(b, &bh, &bl);
2398
2399 double p0 = ah.hi * bh.hi; // exact (52 bits in product) 0
2400 double p1 = ah.hi * bh.lo; // exact (52 bits in product) 26
2401 double p2 = ah.lo * bh.hi; // exact (52 bits in product) 26
2402 double p3 = ah.lo * bh.lo; // exact (52 bits in product) 52
2403 double p4 = al.hi * bh.hi; // exact (52 bits in product) 52
2404 double p5 = al.hi * bh.lo; // exact (52 bits in product) 78
2405 double p6 = al.lo * bh.hi; // inexact (54 bits in product) 78
2406 double p7 = al.lo * bh.lo; // inexact (54 bits in product) 104
2407 double p8 = ah.hi * bl.hi; // exact (52 bits in product) 52
2408 double p9 = ah.hi * bl.lo; // inexact (54 bits in product) 78
2409 double pA = ah.lo * bl.hi; // exact (52 bits in product) 78
2410 double pB = ah.lo * bl.lo; // inexact (54 bits in product) 104
2411 double pC = al.hi * bl.hi; // exact (52 bits in product) 104
2412 // the last 3 terms are two low to appear in the result
2413
2414
2415 // take advantage of the known relative magnitudes of the partial products
2416 // to avoid some sorting Combine 2**-78 and 2**-104 terms. Here we are a bit
2417 // sloppy about canonicalizing the double_doubles
2418 double_double t0 = { pA, pC };
2419 double_double t1 = { p9, pB };
2420 double_double t2 = { p6, p7 };
2421 double temp0, temp1, temp2;
2422
2423 t0 = accum_d(t0, p5); // there is an extra 2**-78 term to deal with
2424
2425 // Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the
2426 // double_doubles
2427 temp0 = t0.hi;
2428 temp1 = t1.hi;
2429 temp2 = t2.hi;
2430 t0.hi += p3;
2431 t1.hi += p4;
2432 t2.hi += p8;
2433 temp0 -= t0.hi - p3;
2434 temp1 -= t1.hi - p4;
2435 temp2 -= t2.hi - p8;
2436 t0.lo += temp0;
2437 t1.lo += temp1;
2438 t2.lo += temp2;
2439
2440 // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the
2441 // double_doubles
2442 temp1 = t1.hi;
2443 temp2 = t2.hi;
2444 t1.hi += p1;
2445 t2.hi += p2;
2446 temp1 -= t1.hi - p1;
2447 temp2 -= t2.hi - p2;
2448 t1.lo += temp1;
2449 t2.lo += temp2;
2450
2451 // Combine accumulators to get the low bits of result
2452 t1 = add_dd(t1, add_dd(t2, t0));
2453
2454 // Add in MSB's, and round to precision
2455 return accum_d(t1, p0); // canonicalizes
2456 }
2457
2458
reference_exp10l(long double z)2459 long double reference_exp10l(long double z)
2460 {
2461 const double_double log2_10 = { HEX_DBL(+, 1, a934f0979a371, +, 1),
2462 HEX_DBL(+, 1, 7f2495fb7fa6d, -, 53) };
2463 double_double x;
2464 int j;
2465
2466 // Handle NaNs
2467 if (isnan(z)) return z;
2468
2469 // init x
2470 x.hi = z;
2471 x.lo = z - x.hi;
2472
2473
2474 // 10**x = exp2( x * log2(10) )
2475
2476 x = mul_dd(x, log2_10); // x * log2(10)
2477
2478 // Deal with overflow and underflow for exp2(x) stage next
2479 if (x.hi >= 1025) return INFINITY;
2480
2481 if (x.hi < -1075 - 24) return +0.0;
2482
2483 // find nearest integer to x
2484 int i = (int)rint(x.hi);
2485
2486 // x now holds fractional part. The result would be then 2**i * exp2( x )
2487 x.hi -= i;
2488
2489 // We could attempt to find a minimax polynomial for exp2(x) over the range
2490 // x = [-0.5, 0.5]. However, this would converge very slowly near the
2491 // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
2492 // requiring something like a 20th order polynomial to get 53 + 24 bits of
2493 // precision. Instead we further reduce the range to [-1/32, 1/32] by
2494 // observing that
2495 //
2496 // 2**(a+b) = 2**a * 2**b
2497 //
2498 // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
2499 // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
2500 // value of n/16 from x.
2501 const double_double corrections[17] = {
2502 { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
2503 HEX_DBL(-, 1, bdd3413b26456, -, 55) },
2504 { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
2505 HEX_DBL(-, 1, 41577ee04992f, -, 56) },
2506 { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
2507 HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
2508 { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
2509 HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
2510 { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
2511 HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
2512 { HEX_DBL(+, 1, c199bdd85529c, -, 1),
2513 HEX_DBL(+, 1, 11065895048dd, -, 56) },
2514 { HEX_DBL(+, 1, d5818dcfba487, -, 1),
2515 HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
2516 { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
2517 HEX_DBL(-, 1, e9c23179c2893, -, 55) },
2518 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
2519 { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
2520 HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
2521 { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
2522 HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
2523 { HEX_DBL(+, 1, 2387a6e756238, +, 0),
2524 HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
2525 { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
2526 HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
2527 { HEX_DBL(+, 1, 3dea64c123422, +, 0),
2528 HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
2529 { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
2530 HEX_DBL(+, 1, d4397afec42e2, -, 56) },
2531 { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
2532 HEX_DBL(+, 1, 6324c054647ad, -, 54) },
2533 { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
2534 HEX_DBL(-, 1, bdd3413b26456, -, 54) }
2535
2536 };
2537 int index = (int)rint(x.hi * 16.0);
2538 x.hi -= (double)index * 0.0625;
2539
2540 // canonicalize x
2541 double temp = x.hi;
2542 x.hi += x.lo;
2543 x.lo -= x.hi - temp;
2544
2545 // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32]. Max
2546 // Error: 2 * 0x1.e112p-87
2547 const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
2548 HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
2549 { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
2550 HEX_DBL(-, 1, 5e4987a631846, -, 57) },
2551 { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
2552 HEX_DBL(-, 1, d323200a05713, -, 59) },
2553 { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
2554 HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
2555 { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
2556 HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
2557 { HEX_DBL(+, 1, 430912f7e8373, -, 13),
2558 HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
2559 { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
2560 HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
2561 { HEX_DBL(+, 1, 62c150eb16465, -, 20),
2562 HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
2563 { HEX_DBL(+, 1, b52502b5e953, -, 24),
2564 HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
2565 size_t count = sizeof(c) / sizeof(c[0]);
2566
2567 // Do polynomial
2568 double_double r = c[count - 1];
2569 for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
2570
2571 // unwind approximation
2572 r = mul_dd(r, x); // before: r =(exp2(x)-1)/x; after: r = exp2(x) - 1
2573
2574 // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
2575 // exp2(x) = (r + 1) * correction = r * correction + correction
2576 r = mul_dd(r, corrections[index + 8]);
2577 r = add_dd(r, corrections[index + 8]);
2578
2579 // Format result for output:
2580
2581 // Get mantissa
2582 long double m = ((long double)r.hi + (long double)r.lo);
2583
2584 // Handle a pesky overflow cases when long double = double
2585 if (i > 512)
2586 {
2587 m *= HEX_DBL(+, 1, 0, +, 512);
2588 i -= 512;
2589 }
2590 else if (i < -512)
2591 {
2592 m *= HEX_DBL(+, 1, 0, -, 512);
2593 i += 512;
2594 }
2595
2596 return m * ldexpl(1.0L, i);
2597 }
2598
2599
fallback_frexp(double x,int * iptr)2600 static double fallback_frexp(double x, int *iptr)
2601 {
2602 cl_ulong u, v;
2603 double fu, fv;
2604
2605 memcpy(&u, &x, sizeof(u));
2606
2607 cl_ulong exponent = u & 0x7ff0000000000000ULL;
2608 cl_ulong mantissa = u & ~0x7ff0000000000000ULL;
2609
2610 // add 1 to the exponent
2611 exponent += 0x0010000000000000ULL;
2612
2613 if ((cl_long)exponent < (cl_long)0x0020000000000000LL)
2614 { // subnormal, NaN, Inf
2615 mantissa |= 0x3fe0000000000000ULL;
2616
2617 v = mantissa & 0xfff0000000000000ULL;
2618 u = mantissa;
2619 memcpy(&fv, &v, sizeof(v));
2620 memcpy(&fu, &u, sizeof(u));
2621
2622 fu -= fv;
2623
2624 memcpy(&v, &fv, sizeof(v));
2625 memcpy(&u, &fu, sizeof(u));
2626
2627 exponent = u & 0x7ff0000000000000ULL;
2628 mantissa = u & ~0x7ff0000000000000ULL;
2629
2630 *iptr = (exponent >> 52) + (-1022 + 1 - 1022);
2631 u = mantissa | 0x3fe0000000000000ULL;
2632 memcpy(&fu, &u, sizeof(u));
2633 return fu;
2634 }
2635
2636 *iptr = (exponent >> 52) - 1023;
2637 u = mantissa | 0x3fe0000000000000ULL;
2638 memcpy(&fu, &u, sizeof(u));
2639 return fu;
2640 }
2641
2642 // Assumes zeros, infinities and NaNs handed elsewhere
extract(double x,cl_ulong * mant)2643 static inline int extract(double x, cl_ulong *mant)
2644 {
2645 static double (*frexpp)(double, int *) = NULL;
2646 int e;
2647
2648 // verify that frexp works properly
2649 if (NULL == frexpp)
2650 {
2651 if (0.5 == frexp(HEX_DBL(+, 1, 0, -, 1030), &e) && e == -1029)
2652 frexpp = frexp;
2653 else
2654 frexpp = fallback_frexp;
2655 }
2656
2657 *mant = (cl_ulong)(HEX_DBL(+, 1, 0, +, 64) * fabs(frexpp(x, &e)));
2658 return e - 1;
2659 }
2660
2661 // Return 128-bit product of a*b as (hi << 64) + lo
mul128(cl_ulong a,cl_ulong b,cl_ulong * hi,cl_ulong * lo)2662 static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo)
2663 {
2664 cl_ulong alo = a & 0xffffffffULL;
2665 cl_ulong ahi = a >> 32;
2666 cl_ulong blo = b & 0xffffffffULL;
2667 cl_ulong bhi = b >> 32;
2668 cl_ulong aloblo = alo * blo;
2669 cl_ulong alobhi = alo * bhi;
2670 cl_ulong ahiblo = ahi * blo;
2671 cl_ulong ahibhi = ahi * bhi;
2672
2673 alobhi += (aloblo >> 32)
2674 + (ahiblo
2675 & 0xffffffffULL); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1) =
2676 // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
2677 *hi = ahibhi + (alobhi >> 32)
2678 + (ahiblo >> 32); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1) =
2679 // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
2680 *lo = (aloblo & 0xffffffffULL) | (alobhi << 32);
2681 }
2682
round_to_nearest_even_double(cl_ulong hi,cl_ulong lo,int exponent)2683 static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
2684 int exponent)
2685 {
2686 union {
2687 cl_ulong u;
2688 cl_double d;
2689 } u;
2690
2691 // edges
2692 if (exponent > 1023) return INFINITY;
2693 if (exponent == -1075 && (hi | (lo != 0)) > 0x8000000000000000ULL)
2694 return HEX_DBL(+, 1, 0, -, 1074);
2695 if (exponent <= -1075) return 0.0;
2696
2697 // Figure out which bits go where
2698 int shift = 11;
2699 if (exponent < -1022)
2700 {
2701 shift -= 1022 + exponent; // subnormal: shift is not 52
2702 exponent = -1023; // set exponent to 0
2703 }
2704 else
2705 hi &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
2706 // it.
2707
2708 // Assemble the double (round toward zero)
2709 u.u = (hi >> shift) | ((cl_ulong)(exponent + 1023) << 52);
2710
2711 // put a representation of the residual bits into hi
2712 hi <<= (64 - shift);
2713 hi |= lo >> shift;
2714 lo <<= (64 - shift);
2715 hi |= lo != 0;
2716
2717 // round to nearest, ties to even
2718 if (hi < 0x8000000000000000ULL) return u.d;
2719 if (hi == 0x8000000000000000ULL)
2720 u.u += u.u & 1ULL;
2721 else
2722 u.u++;
2723
2724 return u.d;
2725 }
2726
2727 // Shift right. Bits lost on the right will be OR'd together and OR'd with the
2728 // LSB
shift_right_sticky_128(cl_ulong * hi,cl_ulong * lo,int shift)2729 static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, int shift)
2730 {
2731 cl_ulong sticky = 0;
2732 cl_ulong h = *hi;
2733 cl_ulong l = *lo;
2734
2735 if (shift >= 64)
2736 {
2737 shift -= 64;
2738 sticky = 0 != lo;
2739 l = h;
2740 h = 0;
2741 if (shift >= 64)
2742 {
2743 sticky |= (0 != l);
2744 l = 0;
2745 }
2746 else
2747 {
2748 sticky |= (0 != (l << (64 - shift)));
2749 l >>= shift;
2750 }
2751 }
2752 else
2753 {
2754 sticky |= (0 != (l << (64 - shift)));
2755 l >>= shift;
2756 l |= h << (64 - shift);
2757 h >>= shift;
2758 }
2759
2760 *lo = l | sticky;
2761 *hi = h;
2762 }
2763
2764 // 128-bit add of ((*hi << 64) + *lo) + ((chi << 64) + clo)
2765 // If the 129 bit result doesn't fit, bits lost off the right end will be OR'd
2766 // with the LSB
add128(cl_ulong * hi,cl_ulong * lo,cl_ulong chi,cl_ulong clo,int * exponent)2767 static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
2768 cl_ulong clo, int *exponent)
2769 {
2770 cl_ulong carry, carry2;
2771 // extended precision add
2772 clo = add_carry(*lo, clo, &carry);
2773 chi = add_carry(*hi, chi, &carry2);
2774 chi = add_carry(chi, carry, &carry);
2775
2776 // If we overflowed the 128 bit result
2777 if (carry || carry2)
2778 {
2779 carry = clo & 1; // set aside low bit
2780 clo >>= 1; // right shift low 1
2781 clo |= carry; // or back in the low bit, so we don't come to believe
2782 // this is an exact half way case for rounding
2783 clo |= chi << 63; // move lowest high bit into highest bit of lo
2784 chi >>= 1; // right shift hi
2785 chi |= 0x8000000000000000ULL; // move the carry bit into hi.
2786 *exponent = *exponent + 1;
2787 }
2788
2789 *hi = chi;
2790 *lo = clo;
2791 }
2792
2793 // 128-bit subtract of ((chi << 64) + clo) - ((*hi << 64) + *lo)
sub128(cl_ulong * chi,cl_ulong * clo,cl_ulong hi,cl_ulong lo,cl_ulong * signC,int * expC)2794 static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
2795 cl_ulong lo, cl_ulong *signC, int *expC)
2796 {
2797 cl_ulong rHi = *chi;
2798 cl_ulong rLo = *clo;
2799 cl_ulong carry, carry2;
2800
2801 // extended precision subtract
2802 rLo = sub_carry(rLo, lo, &carry);
2803 rHi = sub_carry(rHi, hi, &carry2);
2804 rHi = sub_carry(rHi, carry, &carry);
2805
2806 // Check for sign flip
2807 if (carry || carry2)
2808 {
2809 *signC ^= 0x8000000000000000ULL;
2810
2811 // negate rLo, rHi: -x = (x ^ -1) + 1
2812 rLo ^= -1ULL;
2813 rHi ^= -1ULL;
2814 rLo++;
2815 rHi += 0 == rLo;
2816 }
2817
2818 // normalize -- move the most significant non-zero bit to the MSB, and
2819 // adjust exponent accordingly
2820 if (rHi == 0)
2821 {
2822 rHi = rLo;
2823 *expC = *expC - 64;
2824 rLo = 0;
2825 }
2826
2827 if (rHi)
2828 {
2829 int shift = 32;
2830 cl_ulong test = 1ULL << 32;
2831 while (0 == (rHi & 0x8000000000000000ULL))
2832 {
2833 if (rHi < test)
2834 {
2835 rHi <<= shift;
2836 rHi |= rLo >> (64 - shift);
2837 rLo <<= shift;
2838 *expC = *expC - shift;
2839 }
2840 shift >>= 1;
2841 test <<= shift;
2842 }
2843 }
2844 else
2845 {
2846 // zero
2847 *expC = INT_MIN;
2848 *signC = 0;
2849 }
2850
2851
2852 *chi = rHi;
2853 *clo = rLo;
2854 }
2855
reference_fmal(long double x,long double y,long double z)2856 long double reference_fmal(long double x, long double y, long double z)
2857 {
2858 static const cl_ulong kMSB = 0x8000000000000000ULL;
2859
2860 // cast values back to double. This is an exact function, so
2861 double a = x;
2862 double b = y;
2863 double c = z;
2864
2865 // Make bits accessible
2866 union {
2867 cl_ulong u;
2868 cl_double d;
2869 } ua;
2870 ua.d = a;
2871 union {
2872 cl_ulong u;
2873 cl_double d;
2874 } ub;
2875 ub.d = b;
2876 union {
2877 cl_ulong u;
2878 cl_double d;
2879 } uc;
2880 uc.d = c;
2881
2882 // deal with Nans, infinities and zeros
2883 if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
2884 || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
2885 0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
2886 0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
2887 {
2888 if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
2889
2890 a = (double)reference_multiplyl(
2891 a, b); // some risk that the compiler will insert a non-compliant
2892 // fma here on some platforms.
2893 return reference_addl(
2894 a,
2895 c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
2896 }
2897
2898 // extract exponent and mantissa
2899 // exponent is a standard unbiased signed integer
2900 // mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
2901 cl_ulong mantA, mantB, mantC;
2902 int expA = extract(a, &mantA);
2903 int expB = extract(b, &mantB);
2904 int expC = extract(c, &mantC);
2905 cl_ulong signC = uc.u & kMSB; // We'll need the sign bit of C later to
2906 // decide if we are adding or subtracting
2907
2908 // exact product of A and B
2909 int exponent = expA + expB;
2910 cl_ulong sign = (ua.u ^ ub.u) & kMSB;
2911 cl_ulong hi, lo;
2912 mul128(mantA, mantB, &hi, &lo);
2913
2914 // renormalize
2915 if (0 == (kMSB & hi))
2916 {
2917 hi <<= 1;
2918 hi |= lo >> 63;
2919 lo <<= 1;
2920 }
2921 else
2922 exponent++; // 2**63 * 2**63 gives 2**126. If the MSB was set, then our
2923 // exponent increased.
2924
2925 // infinite precision add
2926 cl_ulong chi = mantC;
2927 cl_ulong clo = 0;
2928
2929 if (exponent >= expC)
2930 {
2931 // Normalize C relative to the product
2932 if (exponent > expC)
2933 shift_right_sticky_128(&chi, &clo, exponent - expC);
2934
2935 // Add
2936 if (sign ^ signC)
2937 sub128(&hi, &lo, chi, clo, &sign, &exponent);
2938 else
2939 add128(&hi, &lo, chi, clo, &exponent);
2940 }
2941 else
2942 {
2943 // Shift the product relative to C so that their exponents match
2944 shift_right_sticky_128(&hi, &lo, expC - exponent);
2945
2946 // add
2947 if (sign ^ signC)
2948 sub128(&chi, &clo, hi, lo, &signC, &expC);
2949 else
2950 add128(&chi, &clo, hi, lo, &expC);
2951
2952 hi = chi;
2953 lo = clo;
2954 exponent = expC;
2955 sign = signC;
2956 }
2957
2958 // round
2959 ua.d = round_to_nearest_even_double(hi, lo, exponent);
2960
2961 // Set the sign
2962 ua.u |= sign;
2963
2964 return ua.d;
2965 }
2966
2967
reference_madl(long double a,long double b,long double c)2968 long double reference_madl(long double a, long double b, long double c)
2969 {
2970 return a * b + c;
2971 }
2972
reference_recipl(long double x)2973 long double reference_recipl(long double x) { return 1.0L / x; }
2974
reference_rootnl(long double x,int i)2975 long double reference_rootnl(long double x, int i)
2976 {
2977 // rootn ( x, 0 ) returns a NaN.
2978 if (0 == i) return cl_make_nan();
2979
2980 // rootn ( x, n ) returns a NaN for x < 0 and n is even.
2981 if (x < 0.0L && 0 == (i & 1)) return cl_make_nan();
2982
2983 if (isinf(x))
2984 {
2985 if (i < 0) return reference_copysignl(0.0L, x);
2986
2987 return x;
2988 }
2989
2990 if (x == 0.0)
2991 {
2992 switch (i & 0x80000001)
2993 {
2994 // rootn ( +-0, n ) is +0 for even n > 0.
2995 case 0: return 0.0L;
2996
2997 // rootn ( +-0, n ) is +-0 for odd n > 0.
2998 case 1: return x;
2999
3000 // rootn ( +-0, n ) is +inf for even n < 0.
3001 case 0x80000000: return INFINITY;
3002
3003 // rootn ( +-0, n ) is +-inf for odd n < 0.
3004 case 0x80000001: return copysign(INFINITY, x);
3005 }
3006 }
3007
3008 if (i == 1) return x;
3009
3010 if (i == -1) return 1.0 / x;
3011
3012 long double sign = x;
3013 x = reference_fabsl(x);
3014 double iHi, iLo;
3015 DivideDD(&iHi, &iLo, 1.0, i);
3016 x = reference_powl(x, iHi) * reference_powl(x, iLo);
3017
3018 return reference_copysignl(x, sign);
3019 }
3020
reference_rsqrtl(long double x)3021 long double reference_rsqrtl(long double x) { return 1.0L / sqrtl(x); }
3022
reference_sinpil(long double x)3023 long double reference_sinpil(long double x)
3024 {
3025 double r = reduce1l(x);
3026
3027 // reduce to [-0.5, 0.5]
3028 if (r < -0.5L)
3029 r = -1.0L - r;
3030 else if (r > 0.5L)
3031 r = 1.0L - r;
3032
3033 // sinPi zeros have the same sign as x
3034 if (r == 0.0L) return reference_copysignl(0.0L, x);
3035
3036 return reference_sinl(r * M_PIL);
3037 }
3038
reference_tanpil(long double x)3039 long double reference_tanpil(long double x)
3040 {
3041 // set aside the sign (allows us to preserve sign of -0)
3042 long double sign = reference_copysignl(1.0L, x);
3043 long double z = reference_fabsl(x);
3044
3045 // if big and even -- caution: only works if x only has single precision
3046 if (z >= HEX_LDBL(+, 1, 0, +, 53))
3047 {
3048 if (z == INFINITY) return x - x; // nan
3049
3050 return reference_copysignl(
3051 0.0L, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n.
3052 }
3053
3054 // reduce to the range [ -0.5, 0.5 ]
3055 long double nearest =
3056 reference_rintl(z); // round to nearest even places n + 0.5 values in
3057 // the right place for us
3058 int64_t i =
3059 (int64_t)nearest; // test above against 0x1.0p53 avoids overflow here
3060 z -= nearest;
3061
3062 // correction for odd integer x for the right sign of zero
3063 if ((i & 1) && z == 0.0L) sign = -sign;
3064
3065 // track changes to the sign
3066 sign *= reference_copysignl(1.0L, z); // really should just be an xor
3067 z = reference_fabsl(z); // remove the sign again
3068
3069 // reduce once more
3070 // If we don't do this, rounding error in z * M_PI will cause us not to
3071 // return infinities properly
3072 if (z > 0.25L)
3073 {
3074 z = 0.5L - z;
3075 return sign
3076 / reference_tanl(z
3077 * M_PIL); // use system tan to get the right result
3078 }
3079
3080 //
3081 return sign
3082 * reference_tanl(z * M_PIL); // use system tan to get the right result
3083 }
3084
reference_pownl(long double x,int i)3085 long double reference_pownl(long double x, int i)
3086 {
3087 return reference_powl(x, (long double)i);
3088 }
3089
reference_powrl(long double x,long double y)3090 long double reference_powrl(long double x, long double y)
3091 {
3092 // powr ( x, y ) returns NaN for x < 0.
3093 if (x < 0.0L) return cl_make_nan();
3094
3095 // powr ( x, NaN ) returns the NaN for x >= 0.
3096 // powr ( NaN, y ) returns the NaN.
3097 if (isnan(x) || isnan(y))
3098 return x + y; // Note: behavior different here than for pow(1,NaN),
3099 // pow(NaN, 0)
3100
3101 if (x == 1.0L)
3102 {
3103 // powr ( +1, +-inf ) returns NaN.
3104 if (reference_fabsl(y) == INFINITY) return cl_make_nan();
3105
3106 // powr ( +1, y ) is 1 for finite y. (NaN handled above)
3107 return 1.0L;
3108 }
3109
3110 if (y == 0.0L)
3111 {
3112 // powr ( +inf, +-0 ) returns NaN.
3113 // powr ( +-0, +-0 ) returns NaN.
3114 if (x == 0.0L || x == INFINITY) return cl_make_nan();
3115
3116 // powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already
3117 // handled above)
3118 return 1.0L;
3119 }
3120
3121 if (x == 0.0L)
3122 {
3123 // powr ( +-0, -inf) is +inf.
3124 // powr ( +-0, y ) is +inf for finite y < 0.
3125 if (y < 0.0L) return INFINITY;
3126
3127 // powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above)
3128 return 0.0L;
3129 }
3130
3131 return reference_powl(x, y);
3132 }
3133
reference_addl(long double x,long double y)3134 long double reference_addl(long double x, long double y)
3135 {
3136 volatile double a = (double)x;
3137 volatile double b = (double)y;
3138
3139 #if defined(__SSE2__)
3140 // defeat x87
3141 __m128d va = _mm_set_sd((double)a);
3142 __m128d vb = _mm_set_sd((double)b);
3143 va = _mm_add_sd(va, vb);
3144 _mm_store_sd((double *)&a, va);
3145 #else
3146 a += b;
3147 #endif
3148 return (long double)a;
3149 }
3150
reference_subtractl(long double x,long double y)3151 long double reference_subtractl(long double x, long double y)
3152 {
3153 volatile double a = (double)x;
3154 volatile double b = (double)y;
3155
3156 #if defined(__SSE2__)
3157 // defeat x87
3158 __m128d va = _mm_set_sd((double)a);
3159 __m128d vb = _mm_set_sd((double)b);
3160 va = _mm_sub_sd(va, vb);
3161 _mm_store_sd((double *)&a, va);
3162 #else
3163 a -= b;
3164 #endif
3165 return (long double)a;
3166 }
3167
reference_multiplyl(long double x,long double y)3168 long double reference_multiplyl(long double x, long double y)
3169 {
3170 volatile double a = (double)x;
3171 volatile double b = (double)y;
3172
3173 #if defined(__SSE2__)
3174 // defeat x87
3175 __m128d va = _mm_set_sd((double)a);
3176 __m128d vb = _mm_set_sd((double)b);
3177 va = _mm_mul_sd(va, vb);
3178 _mm_store_sd((double *)&a, va);
3179 #else
3180 a *= b;
3181 #endif
3182 return (long double)a;
3183 }
3184
reference_lgamma_rl(long double x,int * signp)3185 long double reference_lgamma_rl(long double x, int *signp)
3186 {
3187 *signp = 0;
3188 return x;
3189 }
3190
reference_isequall(long double x,long double y)3191 int reference_isequall(long double x, long double y) { return x == y; }
reference_isfinitel(long double x)3192 int reference_isfinitel(long double x) { return 0 != isfinite(x); }
reference_isgreaterl(long double x,long double y)3193 int reference_isgreaterl(long double x, long double y) { return x > y; }
reference_isgreaterequall(long double x,long double y)3194 int reference_isgreaterequall(long double x, long double y) { return x >= y; }
reference_isinfl(long double x)3195 int reference_isinfl(long double x) { return 0 != isinf(x); }
reference_islessl(long double x,long double y)3196 int reference_islessl(long double x, long double y) { return x < y; }
reference_islessequall(long double x,long double y)3197 int reference_islessequall(long double x, long double y) { return x <= y; }
3198 #if defined(__INTEL_COMPILER)
reference_islessgreaterl(long double x,long double y)3199 int reference_islessgreaterl(long double x, long double y)
3200 {
3201 return 0 != islessgreaterl(x, y);
3202 }
3203 #else
reference_islessgreaterl(long double x,long double y)3204 int reference_islessgreaterl(long double x, long double y)
3205 {
3206 return 0 != islessgreater(x, y);
3207 }
3208 #endif
reference_isnanl(long double x)3209 int reference_isnanl(long double x) { return 0 != isnan(x); }
reference_isnormall(long double x)3210 int reference_isnormall(long double x) { return 0 != isnormal((double)x); }
reference_isnotequall(long double x,long double y)3211 int reference_isnotequall(long double x, long double y) { return x != y; }
reference_isorderedl(long double x,long double y)3212 int reference_isorderedl(long double x, long double y)
3213 {
3214 return x == x && y == y;
3215 }
reference_isunorderedl(long double x,long double y)3216 int reference_isunorderedl(long double x, long double y)
3217 {
3218 return isnan(x) || isnan(y);
3219 }
3220 #if defined(__INTEL_COMPILER)
reference_signbitl(long double x)3221 int reference_signbitl(long double x) { return 0 != signbitl(x); }
3222 #else
reference_signbitl(long double x)3223 int reference_signbitl(long double x) { return 0 != signbit(x); }
3224 #endif
3225 long double reference_copysignl(long double x, long double y);
3226 long double reference_roundl(long double x);
3227 long double reference_cbrtl(long double x);
3228
reference_copysignl(long double x,long double y)3229 long double reference_copysignl(long double x, long double y)
3230 {
3231 // We hope that the long double to double conversion proceeds with sign
3232 // fidelity, even for zeros and NaNs
3233 union {
3234 double d;
3235 cl_ulong u;
3236 } u;
3237 u.d = (double)y;
3238
3239 x = reference_fabsl(x);
3240 if (u.u >> 63) x = -x;
3241
3242 return x;
3243 }
3244
reference_roundl(long double x)3245 long double reference_roundl(long double x)
3246 {
3247 // Since we are just using this to verify double precision, we can
3248 // use the double precision copysign here
3249
3250 #if defined(__MINGW32__) && defined(__x86_64__)
3251 long double absx = reference_fabsl(x);
3252 if (absx < 0.5L) return reference_copysignl(0.0L, x);
3253 #endif
3254 return round((double)x);
3255 }
3256
reference_truncl(long double x)3257 long double reference_truncl(long double x)
3258 {
3259 // Since we are just using this to verify double precision, we can
3260 // use the double precision copysign here
3261 return trunc((double)x);
3262 }
3263
3264 static long double reference_scalblnl(long double x, long n);
3265
reference_cbrtl(long double x)3266 long double reference_cbrtl(long double x)
3267 {
3268 double yhi = HEX_DBL(+, 1, 5555555555555, -, 2);
3269 double ylo = HEX_DBL(+, 1, 558, -, 56);
3270
3271 double fabsx = reference_fabs(x);
3272
3273 if (isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x)) return x;
3274
3275 double log2x_hi, log2x_lo;
3276
3277 // extended precision log .... accurate to at least 64-bits + couple of
3278 // guard bits
3279 __log2_ep(&log2x_hi, &log2x_lo, fabsx);
3280
3281 double ylog2x_hi, ylog2x_lo;
3282
3283 double y_hi = yhi;
3284 double y_lo = ylo;
3285
3286 // compute product of y*log2(x)
3287 MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
3288
3289 long double powxy;
3290 if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
3291 {
3292 powxy =
3293 reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
3294 }
3295 else
3296 {
3297 // separate integer + fractional part
3298 long int m = lrint(ylog2x_hi);
3299 AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
3300
3301 // revert to long double arithemtic
3302 long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
3303 powxy = reference_exp2l(ylog2x);
3304 powxy = reference_scalblnl(powxy, m);
3305 }
3306
3307 return reference_copysignl(powxy, x);
3308 }
3309
reference_rintl(long double x)3310 long double reference_rintl(long double x)
3311 {
3312 #if defined(__PPC__)
3313 // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined
3314 // mantissa can represent more than LDBL_MANT_DIG binary digits.
3315 x = rintl(x);
3316 #else
3317 static long double magic[2] = { 0.0L, 0.0L };
3318
3319 if (0.0L == magic[0])
3320 {
3321 magic[0] = scalbnl(0.5L, LDBL_MANT_DIG);
3322 magic[1] = scalbnl(-0.5L, LDBL_MANT_DIG);
3323 }
3324
3325 if (reference_fabsl(x) < magic[0] && x != 0.0L)
3326 {
3327 long double m = magic[x < 0];
3328 x += m;
3329 x -= m;
3330 }
3331 #endif // __PPC__
3332 return x;
3333 }
3334
3335 // extended precision sqrt using newton iteration on 1/sqrt(x).
3336 // Final result is computed as x * 1/sqrt(x)
__sqrt_ep(double * rhi,double * rlo,double xhi,double xlo)3337 static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo)
3338 {
3339 // approximate reciprocal sqrt
3340 double thi = 1.0 / sqrt(xhi);
3341 double tlo = 0.0;
3342
3343 // One newton iteration in double-double
3344 double yhi, ylo;
3345 MulDD(&yhi, &ylo, thi, tlo, thi, tlo);
3346 MulDD(&yhi, &ylo, yhi, ylo, xhi, xlo);
3347 AddDD(&yhi, &ylo, -yhi, -ylo, 3.0, 0.0);
3348 MulDD(&yhi, &ylo, yhi, ylo, thi, tlo);
3349 MulDD(&yhi, &ylo, yhi, ylo, 0.5, 0.0);
3350
3351 MulDD(rhi, rlo, yhi, ylo, xhi, xlo);
3352 }
3353
reference_acoshl(long double x)3354 long double reference_acoshl(long double x)
3355 {
3356 /*
3357 * ====================================================
3358 * This function derived from fdlibm http://www.netlib.org
3359 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
3360 *
3361 * Developed at SunSoft, a Sun Microsystems, Inc. business.
3362 * Permission to use, copy, modify, and distribute this
3363 * software is freely granted, provided that this notice
3364 * is preserved.
3365 * ====================================================
3366 *
3367 */
3368 if (isnan(x) || isinf(x)) return x + fabsl(x);
3369
3370 if (x < 1.0L) return cl_make_nan();
3371
3372 if (x == 1.0L) return 0.0L;
3373
3374 if (x > HEX_LDBL(+, 1, 0, +, 60))
3375 return reference_logl(x) + 0.693147180559945309417232121458176568L;
3376
3377 if (x > 2.0L)
3378 return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x * x - 1.0L)));
3379
3380 double hi, lo;
3381 MulD(&hi, &lo, x, x);
3382 AddDD(&hi, &lo, hi, lo, -1.0, 0.0);
3383 __sqrt_ep(&hi, &lo, hi, lo);
3384 AddDD(&hi, &lo, hi, lo, x, 0.0);
3385 double correction = lo / hi;
3386 __log2_ep(&hi, &lo, hi);
3387 double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
3388 double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
3389 MulDD(&hi, &lo, hi, lo, log2Hi, log2Lo);
3390 AddDD(&hi, &lo, hi, lo, correction, 0.0);
3391
3392 return hi + lo;
3393 }
3394
reference_asinhl(long double x)3395 long double reference_asinhl(long double x)
3396 {
3397 long double cutoff = 0.0L;
3398 const long double ln2 = HEX_LDBL(+, b, 17217f7d1cf79ab, -, 4);
3399
3400 if (cutoff == 0.0L) cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG);
3401
3402 if (isnan(x) || isinf(x)) return x + x;
3403
3404 long double absx = reference_fabsl(x);
3405 if (absx < cutoff) return x;
3406
3407 long double sign = reference_copysignl(1.0L, x);
3408
3409 if (absx <= 4.0 / 3.0)
3410 {
3411 return sign
3412 * reference_log1pl(absx + x * x / (1.0 + sqrtl(1.0 + x * x)));
3413 }
3414 else if (absx <= HEX_LDBL(+, 1, 0, +, 27))
3415 {
3416 return sign
3417 * reference_logl(2.0L * absx + 1.0L / (sqrtl(x * x + 1.0) + absx));
3418 }
3419 else
3420 {
3421 return sign * (reference_logl(absx) + ln2);
3422 }
3423 }
3424
reference_atanhl(long double x)3425 long double reference_atanhl(long double x)
3426 {
3427 /*
3428 * ====================================================
3429 * This function is from fdlibm: http://www.netlib.org
3430 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
3431 *
3432 * Developed at SunSoft, a Sun Microsystems, Inc. business.
3433 * Permission to use, copy, modify, and distribute this
3434 * software is freely granted, provided that this notice
3435 * is preserved.
3436 * ====================================================
3437 */
3438 if (isnan(x)) return x + x;
3439
3440 long double signed_half = reference_copysignl(0.5L, x);
3441 x = reference_fabsl(x);
3442 if (x > 1.0L) return cl_make_nan();
3443
3444 if (x < 0.5L)
3445 return signed_half * reference_log1pl(2.0L * (x + x * x / (1 - x)));
3446
3447 return signed_half * reference_log1pl(2.0L * x / (1 - x));
3448 }
3449
reference_exp2l(long double z)3450 long double reference_exp2l(long double z)
3451 {
3452 double_double x;
3453 int j;
3454
3455 // Handle NaNs
3456 if (isnan(z)) return z;
3457
3458 // init x
3459 x.hi = z;
3460 x.lo = z - x.hi;
3461
3462 // Deal with overflow and underflow for exp2(x) stage next
3463 if (x.hi >= 1025) return INFINITY;
3464
3465 if (x.hi < -1075 - 24) return +0.0;
3466
3467 // find nearest integer to x
3468 int i = (int)rint(x.hi);
3469
3470 // x now holds fractional part. The result would be then 2**i * exp2( x )
3471 x.hi -= i;
3472
3473 // We could attempt to find a minimax polynomial for exp2(x) over the range
3474 // x = [-0.5, 0.5]. However, this would converge very slowly near the
3475 // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
3476 // requiring something like a 20th order polynomial to get 53 + 24 bits of
3477 // precision. Instead we further reduce the range to [-1/32, 1/32] by
3478 // observing that
3479 //
3480 // 2**(a+b) = 2**a * 2**b
3481 //
3482 // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
3483 // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
3484 // value of n/16 from x.
3485 const double_double corrections[17] = {
3486 { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
3487 HEX_DBL(-, 1, bdd3413b26456, -, 55) },
3488 { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
3489 HEX_DBL(-, 1, 41577ee04992f, -, 56) },
3490 { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
3491 HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
3492 { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
3493 HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
3494 { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
3495 HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
3496 { HEX_DBL(+, 1, c199bdd85529c, -, 1),
3497 HEX_DBL(+, 1, 11065895048dd, -, 56) },
3498 { HEX_DBL(+, 1, d5818dcfba487, -, 1),
3499 HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
3500 { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
3501 HEX_DBL(-, 1, e9c23179c2893, -, 55) },
3502 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
3503 { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
3504 HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
3505 { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
3506 HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
3507 { HEX_DBL(+, 1, 2387a6e756238, +, 0),
3508 HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
3509 { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
3510 HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
3511 { HEX_DBL(+, 1, 3dea64c123422, +, 0),
3512 HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
3513 { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
3514 HEX_DBL(+, 1, d4397afec42e2, -, 56) },
3515 { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
3516 HEX_DBL(+, 1, 6324c054647ad, -, 54) },
3517 { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
3518 HEX_DBL(-, 1, bdd3413b26456, -, 54) }
3519 };
3520 int index = (int)rint(x.hi * 16.0);
3521 x.hi -= (double)index * 0.0625;
3522
3523 // canonicalize x
3524 double temp = x.hi;
3525 x.hi += x.lo;
3526 x.lo -= x.hi - temp;
3527
3528 // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32]. Max
3529 // Error: 2 * 0x1.e112p-87
3530 const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
3531 HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
3532 { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
3533 HEX_DBL(-, 1, 5e4987a631846, -, 57) },
3534 { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
3535 HEX_DBL(-, 1, d323200a05713, -, 59) },
3536 { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
3537 HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
3538 { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
3539 HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
3540 { HEX_DBL(+, 1, 430912f7e8373, -, 13),
3541 HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
3542 { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
3543 HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
3544 { HEX_DBL(+, 1, 62c150eb16465, -, 20),
3545 HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
3546 { HEX_DBL(+, 1, b52502b5e953, -, 24),
3547 HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
3548 size_t count = sizeof(c) / sizeof(c[0]);
3549
3550 // Do polynomial
3551 double_double r = c[count - 1];
3552 for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
3553
3554 // unwind approximation
3555 r = mul_dd(r, x); // before: r =(exp2(x)-1)/x; after: r = exp2(x) - 1
3556
3557 // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
3558 // exp2(x) = (r + 1) * correction = r * correction + correction
3559 r = mul_dd(r, corrections[index + 8]);
3560 r = add_dd(r, corrections[index + 8]);
3561
3562 // Format result for output:
3563
3564 // Get mantissa
3565 long double m = ((long double)r.hi + (long double)r.lo);
3566
3567 // Handle a pesky overflow cases when long double = double
3568 if (i > 512)
3569 {
3570 m *= HEX_DBL(+, 1, 0, +, 512);
3571 i -= 512;
3572 }
3573 else if (i < -512)
3574 {
3575 m *= HEX_DBL(+, 1, 0, -, 512);
3576 i += 512;
3577 }
3578
3579 return m * ldexpl(1.0L, i);
3580 }
3581
reference_expm1l(long double x)3582 long double reference_expm1l(long double x)
3583 {
3584 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
3585 // unimplemented
3586 return x;
3587 #else
3588 if (reference_isnanl(x)) return x;
3589
3590 if (x > 710) return INFINITY;
3591
3592 long double y = expm1l(x);
3593
3594 // Range of expm1l is -1.0L to +inf. Negative inf
3595 // on a few Linux platforms is clearly the wrong sign.
3596 if (reference_isinfl(y)) y = INFINITY;
3597
3598 return y;
3599 #endif
3600 }
3601
reference_fmaxl(long double x,long double y)3602 long double reference_fmaxl(long double x, long double y)
3603 {
3604 if (isnan(y)) return x;
3605
3606 return x >= y ? x : y;
3607 }
3608
reference_fminl(long double x,long double y)3609 long double reference_fminl(long double x, long double y)
3610 {
3611 if (isnan(y)) return x;
3612
3613 return x <= y ? x : y;
3614 }
3615
reference_hypotl(long double x,long double y)3616 long double reference_hypotl(long double x, long double y)
3617 {
3618 static const double tobig = HEX_DBL(+, 1, 0, +, 511);
3619 static const double big = HEX_DBL(+, 1, 0, +, 513);
3620 static const double rbig = HEX_DBL(+, 1, 0, -, 513);
3621 static const double tosmall = HEX_DBL(+, 1, 0, -, 511);
3622 static const double smalll = HEX_DBL(+, 1, 0, -, 607);
3623 static const double rsmall = HEX_DBL(+, 1, 0, +, 607);
3624
3625 long double max, min;
3626
3627 if (isinf(x) || isinf(y)) return INFINITY;
3628
3629 if (isnan(x) || isnan(y)) return x + y;
3630
3631 x = reference_fabsl(x);
3632 y = reference_fabsl(y);
3633
3634 max = reference_fmaxl(x, y);
3635 min = reference_fminl(x, y);
3636
3637 if (max > tobig)
3638 {
3639 max *= rbig;
3640 min *= rbig;
3641 return big * sqrtl(max * max + min * min);
3642 }
3643
3644 if (max < tosmall)
3645 {
3646 max *= rsmall;
3647 min *= rsmall;
3648 return smalll * sqrtl(max * max + min * min);
3649 }
3650 return sqrtl(x * x + y * y);
3651 }
3652
reference_log2l(long double x)3653 long double reference_log2l(long double x)
3654 {
3655 if (isnan(x) || x < 0.0 || x == -INFINITY) return NAN;
3656
3657 if (x == 0.0f) return -INFINITY;
3658
3659 if (x == INFINITY) return INFINITY;
3660
3661 double hi, lo;
3662 __log2_ep(&hi, &lo, x);
3663
3664 return (long double)hi + (long double)lo;
3665 }
3666
reference_log1pl(long double x)3667 long double reference_log1pl(long double x)
3668 {
3669 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
3670 // unimplemented
3671 return x;
3672 #elif defined(__PPC__)
3673 // log1pl on PPC inadvertantly returns NaN for very large values. Work
3674 // around this limitation by returning logl for large values.
3675 return ((x > (long double)(0x1.0p+1022)) ? logl(x) : log1pl(x));
3676 #else
3677 return log1pl(x);
3678 #endif
3679 }
3680
reference_logbl(long double x)3681 long double reference_logbl(long double x)
3682 {
3683 // Since we are just using this to verify double precision, we can
3684 // use the double precision copysign here
3685 union {
3686 double f;
3687 cl_ulong u;
3688 } u;
3689 u.f = (double)x;
3690
3691 cl_int exponent = (cl_uint)(u.u >> 52) & 0x7ff;
3692 if (exponent == 0x7ff) return x * x;
3693
3694 if (exponent == 0)
3695 { // deal with denormals
3696 u.f = x * HEX_DBL(+, 1, 0, +, 64);
3697 exponent = (cl_int)(u.u >> 52) & 0x7ff;
3698 if (exponent == 0) return -INFINITY;
3699
3700 return exponent - (1023 + 64);
3701 }
3702
3703 return exponent - 1023;
3704 }
3705
reference_maxmagl(long double x,long double y)3706 long double reference_maxmagl(long double x, long double y)
3707 {
3708 long double fabsx = fabsl(x);
3709 long double fabsy = fabsl(y);
3710
3711 if (fabsx < fabsy) return y;
3712
3713 if (fabsy < fabsx) return x;
3714
3715 return reference_fmaxl(x, y);
3716 }
3717
reference_minmagl(long double x,long double y)3718 long double reference_minmagl(long double x, long double y)
3719 {
3720 long double fabsx = fabsl(x);
3721 long double fabsy = fabsl(y);
3722
3723 if (fabsx > fabsy) return y;
3724
3725 if (fabsy > fabsx) return x;
3726
3727 return reference_fminl(x, y);
3728 }
3729
reference_nanl(cl_ulong x)3730 long double reference_nanl(cl_ulong x)
3731 {
3732 union {
3733 cl_ulong u;
3734 cl_double f;
3735 } u;
3736 u.u = x | 0x7ff8000000000000ULL;
3737 return (long double)u.f;
3738 }
3739
3740
reference_reciprocall(long double x)3741 long double reference_reciprocall(long double x) { return 1.0L / x; }
3742
reference_remainderl(long double x,long double y)3743 long double reference_remainderl(long double x, long double y)
3744 {
3745 int i;
3746 return reference_remquol(x, y, &i);
3747 }
3748
reference_lgammal(long double x)3749 long double reference_lgammal(long double x)
3750 {
3751 // lgamma is currently not tested
3752 return reference_lgamma(x);
3753 }
3754
3755 static uint32_t two_over_pi[] = {
3756 0x0, 0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5,
3757 0x1993c439, 0x107f945, 0x23abdebb, 0x31586dc9, 0x6e3a424, 0x374b8019,
3758 0x92eea09, 0x3464873f, 0x21deb1cb, 0x4a69cfb, 0x288235f5, 0xbaed121,
3759 0xe99c702, 0x1ad17df9, 0x13991d6, 0xe60d4ce, 0x1f49c845, 0x3e2ef7e4,
3760 0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d, 0xd9fb3c9,
3761 0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7,
3762 0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c, 0x3046fc7b, 0x1aeafc33,
3763 0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102,
3764 0xdffd880, 0x135cc9cc, 0x10606155
3765 };
3766
3767 static uint32_t pi_over_two[] = { 0x1, 0x2487ed51, 0x42d1846,
3768 0x26263314, 0x1701b839, 0x28948127 };
3769
3770 typedef union {
3771 uint64_t u;
3772 double d;
3773 } d_ui64_t;
3774
3775 // radix or base of representation
3776 #define RADIX (30)
3777 #define DIGITS 6
3778
3779 d_ui64_t two_pow_pradix = { (uint64_t)(1023 + RADIX) << 52 };
3780 d_ui64_t two_pow_mradix = { (uint64_t)(1023 - RADIX) << 52 };
3781 d_ui64_t two_pow_two_mradix = { (uint64_t)(1023 - 2 * RADIX) << 52 };
3782
3783 #define tp_pradix two_pow_pradix.d
3784 #define tp_mradix two_pow_mradix.d
3785
3786 // extended fixed point representation of double precision
3787 // floating point number.
3788 // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ]
3789 typedef struct
3790 {
3791 uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in
3792 // base_30
3793 int index; // exponent bias
3794 int sign; // sign of double
3795 } eprep_t;
3796
double_to_eprep(double x)3797 static eprep_t double_to_eprep(double x)
3798 {
3799 eprep_t result;
3800
3801 result.sign = (signbit(x) == 0) ? 1 : -1;
3802 x = fabs(x);
3803
3804 int index = 0;
3805 while (x > tp_pradix)
3806 {
3807 index++;
3808 x *= tp_mradix;
3809 }
3810 while (x < 1)
3811 {
3812 index--;
3813 x *= tp_pradix;
3814 }
3815
3816 result.index = index;
3817 int i = 0;
3818 result.X[0] = result.X[1] = result.X[2] = 0;
3819 while (x != 0.0)
3820 {
3821 result.X[i] = (uint32_t)x;
3822 x = (x - (double)result.X[i]) * tp_pradix;
3823 i++;
3824 }
3825 return result;
3826 }
3827
eprep_to_double(eprep_t epx)3828 static double eprep_to_double(eprep_t epx)
3829 {
3830 double res = 0.0;
3831
3832 res += ldexp((double)epx.X[0], (epx.index - 0) * RADIX);
3833 res += ldexp((double)epx.X[1], (epx.index - 1) * RADIX);
3834 res += ldexp((double)epx.X[2], (epx.index - 2) * RADIX);
3835
3836 return copysign(res, epx.sign);
3837 }
3838
payne_hanek(double * y,int * exception)3839 static int payne_hanek(double *y, int *exception)
3840 {
3841 double x = *y;
3842
3843 // exception cases .. no reduction required
3844 if (isnan(x) || isinf(x) || (fabs(x) <= M_PI_4))
3845 {
3846 *exception = 1;
3847 return 0;
3848 }
3849
3850 *exception = 0;
3851
3852 // After computation result[0] contains integer part while
3853 // result[1]....result[DIGITS-1] contain fractional part. So we are doing
3854 // computation with (DIGITS-1)*RADIX precision. Default DIGITS=6 and
3855 // RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm shows
3856 // that a double precision x, closest to pi/2 is 6381956970095103 x 2^797
3857 // which can cause 61 digits of cancellation in computation of f = x*2/pi -
3858 // floor(x*2/pi) ... thus we need at least 114 bits (61 leading zeros + 53
3859 // bits of mentissa of f) of precision to accurately compute f in double
3860 // precision. Since we are using 150 bits (still an overkill), we should be
3861 // safe. Extra bits can act as guard bits for correct rounding.
3862 uint64_t result[DIGITS + 2];
3863
3864 // compute extended precision representation of x
3865 eprep_t epx = double_to_eprep(x);
3866 int index = epx.index;
3867 int i, j;
3868 // extended precision multiplication of 2/pi*x .... we will loose at max two
3869 // RADIX=30 bit digits in the worst case
3870 for (i = 0; i < (DIGITS + 2); i++)
3871 {
3872 result[i] = 0;
3873 result[i] += ((index + i - 0) >= 0)
3874 ? ((uint64_t)two_over_pi[index + i - 0] * (uint64_t)epx.X[0])
3875 : 0;
3876 result[i] += ((index + i - 1) >= 0)
3877 ? ((uint64_t)two_over_pi[index + i - 1] * (uint64_t)epx.X[1])
3878 : 0;
3879 result[i] += ((index + i - 2) >= 0)
3880 ? ((uint64_t)two_over_pi[index + i - 2] * (uint64_t)epx.X[2])
3881 : 0;
3882 }
3883
3884 // Carry propagation.
3885 uint64_t tmp;
3886 for (i = DIGITS + 2 - 1; i > 0; i--)
3887 {
3888 tmp = result[i] >> RADIX;
3889 result[i - 1] += tmp;
3890 result[i] -= (tmp << RADIX);
3891 }
3892
3893 // we dont ned to normalize the integer part since only last two bits of
3894 // this will be used subsequently algorithm which remain unaltered by this
3895 // normalization. tmp = result[0] >> RADIX; result[0] -= (tmp << RADIX);
3896 unsigned int N = (unsigned int)result[0];
3897
3898 // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that
3899 // testing if the final x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to
3900 // testing, at this stage, if r[1] (the first fractional digit) is greater
3901 // than (2^RADIX)/2 and substracting pi/4 from x_star to bring it to
3902 // mentioned range is equivalent to substracting fractional part at this
3903 // stage from one and changing the sign.
3904 int sign = 1;
3905 if (result[1] > (uint64_t)(1 << (RADIX - 1)))
3906 {
3907 for (i = 1; i < (DIGITS + 2); i++)
3908 result[i] = (~((unsigned int)result[i]) & 0x3fffffff);
3909 N += 1;
3910 sign = -1;
3911 }
3912
3913 // Again as per Kahan-McDonald algorithim there may be 61 leading zeros in
3914 // the worst case (when x is multiple of 2/pi very close to an integer) so
3915 // we need to get rid of these zeros and adjust the index of final result.
3916 // So in the worst case, precision of comupted result is 90 bits (150 bits
3917 // original bits - 60 lost in cancellation).
3918 int ind = 1;
3919 for (i = 1; i < (DIGITS + 2); i++)
3920 {
3921 if (result[i] != 0)
3922 break;
3923 else
3924 ind++;
3925 }
3926
3927 uint64_t r[DIGITS - 1];
3928 for (i = 0; i < (DIGITS - 1); i++)
3929 {
3930 r[i] = 0;
3931 for (j = 0; j <= i; j++)
3932 {
3933 r[i] += (result[ind + i - j] * (uint64_t)pi_over_two[j]);
3934 }
3935 }
3936 for (i = (DIGITS - 2); i > 0; i--)
3937 {
3938 tmp = r[i] >> RADIX;
3939 r[i - 1] += tmp;
3940 r[i] -= (tmp << RADIX);
3941 }
3942 tmp = r[0] >> RADIX;
3943 r[0] -= (tmp << RADIX);
3944
3945 eprep_t epr;
3946 epr.sign = epx.sign * sign;
3947 if (tmp != 0)
3948 {
3949 epr.index = -ind + 1;
3950 epr.X[0] = (uint32_t)tmp;
3951 epr.X[1] = (uint32_t)r[0];
3952 epr.X[2] = (uint32_t)r[1];
3953 }
3954 else
3955 {
3956 epr.index = -ind;
3957 epr.X[0] = (uint32_t)r[0];
3958 epr.X[1] = (uint32_t)r[1];
3959 epr.X[2] = (uint32_t)r[2];
3960 }
3961
3962 *y = eprep_to_double(epr);
3963 return epx.sign * N;
3964 }
3965
reference_relaxed_cos(double x)3966 double reference_relaxed_cos(double x)
3967 {
3968 if (isnan(x)) return NAN;
3969 return (float)cos((float)x);
3970 }
3971
reference_cos(double x)3972 double reference_cos(double x)
3973 {
3974 int exception;
3975 int N = payne_hanek(&x, &exception);
3976 if (exception) return cos(x);
3977 unsigned int c = N & 3;
3978 switch (c)
3979 {
3980 case 0: return cos(x);
3981 case 1: return -sin(x);
3982 case 2: return -cos(x);
3983 case 3: return sin(x);
3984 }
3985 return 0.0;
3986 }
3987
reference_relaxed_sin(double x)3988 double reference_relaxed_sin(double x) { return (float)sin((float)x); }
3989
reference_sin(double x)3990 double reference_sin(double x)
3991 {
3992 int exception;
3993 int N = payne_hanek(&x, &exception);
3994 if (exception) return sin(x);
3995 int c = N & 3;
3996 switch (c)
3997 {
3998 case 0: return sin(x);
3999 case 1: return cos(x);
4000 case 2: return -sin(x);
4001 case 3: return -cos(x);
4002 }
4003 return 0.0;
4004 }
4005
reference_relaxed_sincos(double x,double * y)4006 double reference_relaxed_sincos(double x, double *y)
4007 {
4008 *y = reference_relaxed_cos(x);
4009 return reference_relaxed_sin(x);
4010 }
4011
reference_sincos(double x,double * y)4012 double reference_sincos(double x, double *y)
4013 {
4014 int exception;
4015 int N = payne_hanek(&x, &exception);
4016 if (exception)
4017 {
4018 *y = cos(x);
4019 return sin(x);
4020 }
4021 int c = N & 3;
4022 switch (c)
4023 {
4024 case 0: *y = cos(x); return sin(x);
4025 case 1: *y = -sin(x); return cos(x);
4026 case 2: *y = -cos(x); return -sin(x);
4027 case 3: *y = sin(x); return -cos(x);
4028 }
4029 return 0.0;
4030 }
4031
reference_relaxed_tan(double x)4032 double reference_relaxed_tan(double x)
4033 {
4034 return ((float)reference_relaxed_sin((float)x))
4035 / ((float)reference_relaxed_cos((float)x));
4036 }
4037
reference_tan(double x)4038 double reference_tan(double x)
4039 {
4040 int exception;
4041 int N = payne_hanek(&x, &exception);
4042 if (exception) return tan(x);
4043 int c = N & 3;
4044 switch (c)
4045 {
4046 case 0: return tan(x);
4047 case 1: return -1.0 / tan(x);
4048 case 2: return tan(x);
4049 case 3: return -1.0 / tan(x);
4050 }
4051 return 0.0;
4052 }
4053
reference_cosl(long double xx)4054 long double reference_cosl(long double xx)
4055 {
4056 double x = (double)xx;
4057 int exception;
4058 int N = payne_hanek(&x, &exception);
4059 if (exception) return cosl(x);
4060 unsigned int c = N & 3;
4061 switch (c)
4062 {
4063 case 0: return cosl(x);
4064 case 1: return -sinl(x);
4065 case 2: return -cosl(x);
4066 case 3: return sinl(x);
4067 }
4068 return 0.0;
4069 }
4070
reference_sinl(long double xx)4071 long double reference_sinl(long double xx)
4072 {
4073 // we use system tanl after reduction which
4074 // can flush denorm input to zero so
4075 // take care of it here.
4076 if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
4077
4078 double x = (double)xx;
4079 int exception;
4080 int N = payne_hanek(&x, &exception);
4081 if (exception) return sinl(x);
4082 int c = N & 3;
4083 switch (c)
4084 {
4085 case 0: return sinl(x);
4086 case 1: return cosl(x);
4087 case 2: return -sinl(x);
4088 case 3: return -cosl(x);
4089 }
4090 return 0.0;
4091 }
4092
reference_sincosl(long double xx,long double * y)4093 long double reference_sincosl(long double xx, long double *y)
4094 {
4095 // we use system tanl after reduction which
4096 // can flush denorm input to zero so
4097 // take care of it here.
4098 if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022))
4099 {
4100 *y = cosl(xx);
4101 return xx;
4102 }
4103
4104 double x = (double)xx;
4105 int exception;
4106 int N = payne_hanek(&x, &exception);
4107 if (exception)
4108 {
4109 *y = cosl(x);
4110 return sinl(x);
4111 }
4112 int c = N & 3;
4113 switch (c)
4114 {
4115 case 0: *y = cosl(x); return sinl(x);
4116 case 1: *y = -sinl(x); return cosl(x);
4117 case 2: *y = -cosl(x); return -sinl(x);
4118 case 3: *y = sinl(x); return -cosl(x);
4119 }
4120 return 0.0;
4121 }
4122
reference_tanl(long double xx)4123 long double reference_tanl(long double xx)
4124 {
4125 // we use system tanl after reduction which
4126 // can flush denorm input to zero so
4127 // take care of it here.
4128 if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
4129
4130 double x = (double)xx;
4131 int exception;
4132 int N = payne_hanek(&x, &exception);
4133 if (exception) return tanl(x);
4134 int c = N & 3;
4135 switch (c)
4136 {
4137 case 0: return tanl(x);
4138 case 1: return -1.0 / tanl(x);
4139 case 2: return tanl(x);
4140 case 3: return -1.0 / tanl(x);
4141 }
4142 return 0.0;
4143 }
4144
4145 static double __loglTable1[64][3] = {
4146 { HEX_DBL(+, 1, 5390948f40fea, +, 0), HEX_DBL(-, 1, a152f142a, -, 2),
4147 HEX_DBL(+, 1, f93e27b43bd2c, -, 40) },
4148 { HEX_DBL(+, 1, 5015015015015, +, 0), HEX_DBL(-, 1, 921800925, -, 2),
4149 HEX_DBL(+, 1, 162432a1b8df7, -, 41) },
4150 { HEX_DBL(+, 1, 4cab88725af6e, +, 0), HEX_DBL(-, 1, 8304d90c18, -, 2),
4151 HEX_DBL(+, 1, 80bb749056fe7, -, 40) },
4152 { HEX_DBL(+, 1, 49539e3b2d066, +, 0), HEX_DBL(-, 1, 7418acebc, -, 2),
4153 HEX_DBL(+, 1, ceac7f0607711, -, 43) },
4154 { HEX_DBL(+, 1, 460cbc7f5cf9a, +, 0), HEX_DBL(-, 1, 6552b49988, -, 2),
4155 HEX_DBL(+, 1, d8913d0e89fa, -, 42) },
4156 { HEX_DBL(+, 1, 42d6625d51f86, +, 0), HEX_DBL(-, 1, 56b22e6b58, -, 2),
4157 HEX_DBL(+, 1, c7eaf515033a1, -, 44) },
4158 { HEX_DBL(+, 1, 3fb013fb013fb, +, 0), HEX_DBL(-, 1, 48365e696, -, 2),
4159 HEX_DBL(+, 1, 434adcde7edc7, -, 41) },
4160 { HEX_DBL(+, 1, 3c995a47babe7, +, 0), HEX_DBL(-, 1, 39de8e156, -, 2),
4161 HEX_DBL(+, 1, 8246f8e527754, -, 40) },
4162 { HEX_DBL(+, 1, 3991c2c187f63, +, 0), HEX_DBL(-, 1, 2baa0c34c, -, 2),
4163 HEX_DBL(+, 1, e1513c28e180d, -, 42) },
4164 { HEX_DBL(+, 1, 3698df3de0747, +, 0), HEX_DBL(-, 1, 1d982c9d58, -, 2),
4165 HEX_DBL(+, 1, 63ea3fed4b8a2, -, 40) },
4166 { HEX_DBL(+, 1, 33ae45b57bcb1, +, 0), HEX_DBL(-, 1, 0fa848045, -, 2),
4167 HEX_DBL(+, 1, 32ccbacf1779b, -, 40) },
4168 { HEX_DBL(+, 1, 30d190130d19, +, 0), HEX_DBL(-, 1, 01d9bbcfa8, -, 2),
4169 HEX_DBL(+, 1, e2bfeb2b884aa, -, 42) },
4170 { HEX_DBL(+, 1, 2e025c04b8097, +, 0), HEX_DBL(-, 1, e857d3d37, -, 3),
4171 HEX_DBL(+, 1, d9309b4d2ea85, -, 40) },
4172 { HEX_DBL(+, 1, 2b404ad012b4, +, 0), HEX_DBL(-, 1, cd3c712d4, -, 3),
4173 HEX_DBL(+, 1, ddf360962d7ab, -, 40) },
4174 { HEX_DBL(+, 1, 288b01288b012, +, 0), HEX_DBL(-, 1, b2602497e, -, 3),
4175 HEX_DBL(+, 1, 597f8a121640f, -, 40) },
4176 { HEX_DBL(+, 1, 25e22708092f1, +, 0), HEX_DBL(-, 1, 97c1cb13d, -, 3),
4177 HEX_DBL(+, 1, 02807d15580dc, -, 40) },
4178 { HEX_DBL(+, 1, 23456789abcdf, +, 0), HEX_DBL(-, 1, 7d60496d, -, 3),
4179 HEX_DBL(+, 1, 12ce913d7a827, -, 41) },
4180 { HEX_DBL(+, 1, 20b470c67c0d8, +, 0), HEX_DBL(-, 1, 633a8bf44, -, 3),
4181 HEX_DBL(+, 1, 0648bca9c96bd, -, 40) },
4182 { HEX_DBL(+, 1, 1e2ef3b3fb874, +, 0), HEX_DBL(-, 1, 494f863b9, -, 3),
4183 HEX_DBL(+, 1, 066fceb89b0eb, -, 42) },
4184 { HEX_DBL(+, 1, 1bb4a4046ed29, +, 0), HEX_DBL(-, 1, 2f9e32d5c, -, 3),
4185 HEX_DBL(+, 1, 17b8b6c4f846b, -, 46) },
4186 { HEX_DBL(+, 1, 19453808ca29c, +, 0), HEX_DBL(-, 1, 162593187, -, 3),
4187 HEX_DBL(+, 1, 2c83506452154, -, 42) },
4188 { HEX_DBL(+, 1, 16e0689427378, +, 0), HEX_DBL(-, 1, f9c95dc1e, -, 4),
4189 HEX_DBL(+, 1, dd5d2183150f3, -, 41) },
4190 { HEX_DBL(+, 1, 1485f0e0acd3b, +, 0), HEX_DBL(-, 1, c7b528b72, -, 4),
4191 HEX_DBL(+, 1, 0e43c4f4e619d, -, 40) },
4192 { HEX_DBL(+, 1, 12358e75d3033, +, 0), HEX_DBL(-, 1, 960caf9ac, -, 4),
4193 HEX_DBL(+, 1, 20fbfd5902a1e, -, 42) },
4194 { HEX_DBL(+, 1, 0fef010fef01, +, 0), HEX_DBL(-, 1, 64ce26c08, -, 4),
4195 HEX_DBL(+, 1, 8ebeefb4ac467, -, 40) },
4196 { HEX_DBL(+, 1, 0db20a88f4695, +, 0), HEX_DBL(-, 1, 33f7cde16, -, 4),
4197 HEX_DBL(+, 1, 30b3312da7a7d, -, 40) },
4198 { HEX_DBL(+, 1, 0b7e6ec259dc7, +, 0), HEX_DBL(-, 1, 0387efbcc, -, 4),
4199 HEX_DBL(+, 1, 796f1632949c3, -, 40) },
4200 { HEX_DBL(+, 1, 0953f39010953, +, 0), HEX_DBL(-, 1, a6f9c378, -, 5),
4201 HEX_DBL(+, 1, 1687e151172cc, -, 40) },
4202 { HEX_DBL(+, 1, 073260a47f7c6, +, 0), HEX_DBL(-, 1, 47aa07358, -, 5),
4203 HEX_DBL(+, 1, 1f87e4a9cc778, -, 42) },
4204 { HEX_DBL(+, 1, 05197f7d73404, +, 0), HEX_DBL(-, 1, d23afc498, -, 6),
4205 HEX_DBL(+, 1, b183a6b628487, -, 40) },
4206 { HEX_DBL(+, 1, 03091b51f5e1a, +, 0), HEX_DBL(-, 1, 16a21e21, -, 6),
4207 HEX_DBL(+, 1, 7d75c58973ce5, -, 40) },
4208 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4209 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4210 { HEX_DBL(+, 1, f44659e4a4271, -, 1), HEX_DBL(+, 1, 11cd1d51, -, 5),
4211 HEX_DBL(+, 1, 9a0d857e2f4b2, -, 40) },
4212 { HEX_DBL(+, 1, ecc07b301ecc, -, 1), HEX_DBL(+, 1, c4dfab908, -, 5),
4213 HEX_DBL(+, 1, 55b53fce557fd, -, 40) },
4214 { HEX_DBL(+, 1, e573ac901e573, -, 1), HEX_DBL(+, 1, 3aa2fdd26, -, 4),
4215 HEX_DBL(+, 1, f1cb0c9532089, -, 40) },
4216 { HEX_DBL(+, 1, de5d6e3f8868a, -, 1), HEX_DBL(+, 1, 918a16e46, -, 4),
4217 HEX_DBL(+, 1, 9af0dcd65a6e1, -, 43) },
4218 { HEX_DBL(+, 1, d77b654b82c33, -, 1), HEX_DBL(+, 1, e72ec117e, -, 4),
4219 HEX_DBL(+, 1, a5b93c4ebe124, -, 40) },
4220 { HEX_DBL(+, 1, d0cb58f6ec074, -, 1), HEX_DBL(+, 1, 1dcd19755, -, 3),
4221 HEX_DBL(+, 1, 5be50e71ddc6c, -, 42) },
4222 { HEX_DBL(+, 1, ca4b3055ee191, -, 1), HEX_DBL(+, 1, 476a9f983, -, 3),
4223 HEX_DBL(+, 1, ee9a798719e7f, -, 40) },
4224 { HEX_DBL(+, 1, c3f8f01c3f8f, -, 1), HEX_DBL(+, 1, 70742d4ef, -, 3),
4225 HEX_DBL(+, 1, 3ff1352c1219c, -, 46) },
4226 { HEX_DBL(+, 1, bdd2b899406f7, -, 1), HEX_DBL(+, 1, 98edd077e, -, 3),
4227 HEX_DBL(+, 1, c383cd11362f4, -, 41) },
4228 { HEX_DBL(+, 1, b7d6c3dda338b, -, 1), HEX_DBL(+, 1, c0db6cdd9, -, 3),
4229 HEX_DBL(+, 1, 37bd85b1a824e, -, 41) },
4230 { HEX_DBL(+, 1, b2036406c80d9, -, 1), HEX_DBL(+, 1, e840be74e, -, 3),
4231 HEX_DBL(+, 1, a9334d525e1ec, -, 41) },
4232 { HEX_DBL(+, 1, ac5701ac5701a, -, 1), HEX_DBL(+, 1, 0790adbb, -, 2),
4233 HEX_DBL(+, 1, 8060bfb6a491, -, 41) },
4234 { HEX_DBL(+, 1, a6d01a6d01a6d, -, 1), HEX_DBL(+, 1, 1ac05b2918, -, 2),
4235 HEX_DBL(+, 1, c1c161471580a, -, 40) },
4236 { HEX_DBL(+, 1, a16d3f97a4b01, -, 1), HEX_DBL(+, 1, 2db10fc4d8, -, 2),
4237 HEX_DBL(+, 1, ab1aa62214581, -, 42) },
4238 { HEX_DBL(+, 1, 9c2d14ee4a101, -, 1), HEX_DBL(+, 1, 406463b1b, -, 2),
4239 HEX_DBL(+, 1, 12e95dbda6611, -, 44) },
4240 { HEX_DBL(+, 1, 970e4f80cb872, -, 1), HEX_DBL(+, 1, 52dbdfc4c8, -, 2),
4241 HEX_DBL(+, 1, 6b53fee511af, -, 42) },
4242 { HEX_DBL(+, 1, 920fb49d0e228, -, 1), HEX_DBL(+, 1, 6518fe467, -, 2),
4243 HEX_DBL(+, 1, eea7d7d7d1764, -, 40) },
4244 { HEX_DBL(+, 1, 8d3018d3018d3, -, 1), HEX_DBL(+, 1, 771d2ba7e8, -, 2),
4245 HEX_DBL(+, 1, ecefa8d4fab97, -, 40) },
4246 { HEX_DBL(+, 1, 886e5f0abb049, -, 1), HEX_DBL(+, 1, 88e9c72e08, -, 2),
4247 HEX_DBL(+, 1, 913ea3d33fd14, -, 41) },
4248 { HEX_DBL(+, 1, 83c977ab2bedd, -, 1), HEX_DBL(+, 1, 9a802391e, -, 2),
4249 HEX_DBL(+, 1, 197e845877c94, -, 41) },
4250 { HEX_DBL(+, 1, 7f405fd017f4, -, 1), HEX_DBL(+, 1, abe18797f, -, 2),
4251 HEX_DBL(+, 1, f4a52f8e8a81, -, 42) },
4252 { HEX_DBL(+, 1, 7ad2208e0ecc3, -, 1), HEX_DBL(+, 1, bd0f2e9e78, -, 2),
4253 HEX_DBL(+, 1, 031f4336644cc, -, 42) },
4254 { HEX_DBL(+, 1, 767dce434a9b1, -, 1), HEX_DBL(+, 1, ce0a4923a, -, 2),
4255 HEX_DBL(+, 1, 61f33c897020c, -, 40) },
4256 { HEX_DBL(+, 1, 724287f46debc, -, 1), HEX_DBL(+, 1, ded3fd442, -, 2),
4257 HEX_DBL(+, 1, b2632e830632, -, 41) },
4258 { HEX_DBL(+, 1, 6e1f76b4337c6, -, 1), HEX_DBL(+, 1, ef6d673288, -, 2),
4259 HEX_DBL(+, 1, 888ec245a0bf, -, 40) },
4260 { HEX_DBL(+, 1, 6a13cd153729, -, 1), HEX_DBL(+, 1, ffd799a838, -, 2),
4261 HEX_DBL(+, 1, fe6f3b2f5fc8e, -, 40) },
4262 { HEX_DBL(+, 1, 661ec6a5122f9, -, 1), HEX_DBL(+, 1, 0809cf27f4, -, 1),
4263 HEX_DBL(+, 1, 81eaa9ef284dd, -, 40) },
4264 { HEX_DBL(+, 1, 623fa7701623f, -, 1), HEX_DBL(+, 1, 10113b153c, -, 1),
4265 HEX_DBL(+, 1, 1d7b07d6b1143, -, 42) },
4266 { HEX_DBL(+, 1, 5e75bb8d015e7, -, 1), HEX_DBL(+, 1, 18028cf728, -, 1),
4267 HEX_DBL(+, 1, 76b100b1f6c6, -, 41) },
4268 { HEX_DBL(+, 1, 5ac056b015ac, -, 1), HEX_DBL(+, 1, 1fde3d30e8, -, 1),
4269 HEX_DBL(+, 1, 26faeb9870945, -, 45) },
4270 { HEX_DBL(+, 1, 571ed3c506b39, -, 1), HEX_DBL(+, 1, 27a4c0585c, -, 1),
4271 HEX_DBL(+, 1, 7f2c5344d762b, -, 42) }
4272 };
4273
4274 static double __loglTable2[64][3] = {
4275 { HEX_DBL(+, 1, 01fbe7f0a1be6, +, 0), HEX_DBL(-, 1, 6cf6ddd26112a, -, 7),
4276 HEX_DBL(+, 1, 0725e5755e314, -, 60) },
4277 { HEX_DBL(+, 1, 01eba93a97b12, +, 0), HEX_DBL(-, 1, 6155b1d99f603, -, 7),
4278 HEX_DBL(+, 1, 4bcea073117f4, -, 60) },
4279 { HEX_DBL(+, 1, 01db6c9029cd1, +, 0), HEX_DBL(-, 1, 55b54153137ff, -, 7),
4280 HEX_DBL(+, 1, 21e8faccad0ec, -, 61) },
4281 { HEX_DBL(+, 1, 01cb31f0f534c, +, 0), HEX_DBL(-, 1, 4a158c27245bd, -, 7),
4282 HEX_DBL(+, 1, 1a5b7bfbf35d3, -, 60) },
4283 { HEX_DBL(+, 1, 01baf95c9723c, +, 0), HEX_DBL(-, 1, 3e76923e3d678, -, 7),
4284 HEX_DBL(+, 1, eee400eb5fe34, -, 62) },
4285 { HEX_DBL(+, 1, 01aac2d2acee6, +, 0), HEX_DBL(-, 1, 32d85380ce776, -, 7),
4286 HEX_DBL(+, 1, cbf7a513937bd, -, 61) },
4287 { HEX_DBL(+, 1, 019a8e52d401e, +, 0), HEX_DBL(-, 1, 273acfd74be72, -, 7),
4288 HEX_DBL(+, 1, 5c64599efa5e6, -, 60) },
4289 { HEX_DBL(+, 1, 018a5bdca9e42, +, 0), HEX_DBL(-, 1, 1b9e072a2e65, -, 7),
4290 HEX_DBL(+, 1, 364180e0a5d37, -, 60) },
4291 { HEX_DBL(+, 1, 017a2b6fcc33e, +, 0), HEX_DBL(-, 1, 1001f961f3243, -, 7),
4292 HEX_DBL(+, 1, 63d795746f216, -, 60) },
4293 { HEX_DBL(+, 1, 0169fd0bd8a8a, +, 0), HEX_DBL(-, 1, 0466a6671bca4, -, 7),
4294 HEX_DBL(+, 1, 4c99ff1907435, -, 60) },
4295 { HEX_DBL(+, 1, 0159d0b06d129, +, 0), HEX_DBL(-, 1, f1981c445cd05, -, 8),
4296 HEX_DBL(+, 1, 4bfff6366b723, -, 62) },
4297 { HEX_DBL(+, 1, 0149a65d275a6, +, 0), HEX_DBL(-, 1, da6460f76ab8c, -, 8),
4298 HEX_DBL(+, 1, 9c5404f47589c, -, 61) },
4299 { HEX_DBL(+, 1, 01397e11a581b, +, 0), HEX_DBL(-, 1, c3321ab87f4ef, -, 8),
4300 HEX_DBL(+, 1, c0da537429cea, -, 61) },
4301 { HEX_DBL(+, 1, 012957cd85a28, +, 0), HEX_DBL(-, 1, ac014958c112c, -, 8),
4302 HEX_DBL(+, 1, 000c2a1b595e3, -, 64) },
4303 { HEX_DBL(+, 1, 0119339065ef7, +, 0), HEX_DBL(-, 1, 94d1eca95f67a, -, 8),
4304 HEX_DBL(+, 1, d8d20b0564d5, -, 61) },
4305 { HEX_DBL(+, 1, 01091159e4b3d, +, 0), HEX_DBL(-, 1, 7da4047b92b3e, -, 8),
4306 HEX_DBL(+, 1, 6194a5d68cf2, -, 66) },
4307 { HEX_DBL(+, 1, 00f8f129a0535, +, 0), HEX_DBL(-, 1, 667790a09bf77, -, 8),
4308 HEX_DBL(+, 1, ca230e0bea645, -, 61) },
4309 { HEX_DBL(+, 1, 00e8d2ff374a1, +, 0), HEX_DBL(-, 1, 4f4c90e9c4ead, -, 8),
4310 HEX_DBL(+, 1, 1de3e7f350c1, -, 61) },
4311 { HEX_DBL(+, 1, 00d8b6da482ce, +, 0), HEX_DBL(-, 1, 3823052860649, -, 8),
4312 HEX_DBL(+, 1, 5789b4c5891b8, -, 64) },
4313 { HEX_DBL(+, 1, 00c89cba71a8c, +, 0), HEX_DBL(-, 1, 20faed2dc9a9e, -, 8),
4314 HEX_DBL(+, 1, 9e7c40f9839fd, -, 62) },
4315 { HEX_DBL(+, 1, 00b8849f52834, +, 0), HEX_DBL(-, 1, 09d448cb65014, -, 8),
4316 HEX_DBL(+, 1, 387e3e9b6d02, -, 62) },
4317 { HEX_DBL(+, 1, 00a86e88899a4, +, 0), HEX_DBL(-, 1, e55e2fa53ebf1, -, 9),
4318 HEX_DBL(+, 1, cdaa71fddfddf, -, 62) },
4319 { HEX_DBL(+, 1, 00985a75b5e3f, +, 0), HEX_DBL(-, 1, b716b429dce0f, -, 9),
4320 HEX_DBL(+, 1, 2f2af081367bf, -, 63) },
4321 { HEX_DBL(+, 1, 00884866766ee, +, 0), HEX_DBL(-, 1, 88d21ec7a16d7, -, 9),
4322 HEX_DBL(+, 1, fb95c228d6f16, -, 62) },
4323 { HEX_DBL(+, 1, 0078385a6a61d, +, 0), HEX_DBL(-, 1, 5a906f219a9e8, -, 9),
4324 HEX_DBL(+, 1, 18aff10a89f29, -, 64) },
4325 { HEX_DBL(+, 1, 00682a5130fbe, +, 0), HEX_DBL(-, 1, 2c51a4dae87f1, -, 9),
4326 HEX_DBL(+, 1, bcc7e33ddde3, -, 63) },
4327 { HEX_DBL(+, 1, 00581e4a69944, +, 0), HEX_DBL(-, 1, fc2b7f2d782b1, -, 10),
4328 HEX_DBL(+, 1, fe3ef3300a9fa, -, 64) },
4329 { HEX_DBL(+, 1, 00481445b39a8, +, 0), HEX_DBL(-, 1, 9fb97df0b0b83, -, 10),
4330 HEX_DBL(+, 1, 0d9a601f2f324, -, 65) },
4331 { HEX_DBL(+, 1, 00380c42ae963, +, 0), HEX_DBL(-, 1, 434d4546227ae, -, 10),
4332 HEX_DBL(+, 1, 0b9b6a5868f33, -, 63) },
4333 { HEX_DBL(+, 1, 00280640fa271, +, 0), HEX_DBL(-, 1, cdcda8e930c19, -, 11),
4334 HEX_DBL(+, 1, 3d424ab39f789, -, 64) },
4335 { HEX_DBL(+, 1, 0018024036051, +, 0), HEX_DBL(-, 1, 150c558601261, -, 11),
4336 HEX_DBL(+, 1, 285bb90327a0f, -, 64) },
4337 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4338 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4339 { HEX_DBL(+, 1, ffa011fca0a1e, -, 1), HEX_DBL(+, 1, 14e5640c4197b, -, 10),
4340 HEX_DBL(+, 1, 95728136ae401, -, 63) },
4341 { HEX_DBL(+, 1, ff6031f064e07, -, 1), HEX_DBL(+, 1, cd61806bf532d, -, 10),
4342 HEX_DBL(+, 1, 568a4f35d8538, -, 63) },
4343 { HEX_DBL(+, 1, ff2061d532b9c, -, 1), HEX_DBL(+, 1, 42e34af550eda, -, 9),
4344 HEX_DBL(+, 1, 8f69cee55fec, -, 62) },
4345 { HEX_DBL(+, 1, fee0a1a513253, -, 1), HEX_DBL(+, 1, 9f0a5523902ea, -, 9),
4346 HEX_DBL(+, 1, daec734b11615, -, 63) },
4347 { HEX_DBL(+, 1, fea0f15a12139, -, 1), HEX_DBL(+, 1, fb25e19f11b26, -, 9),
4348 HEX_DBL(+, 1, 8bafca62941da, -, 62) },
4349 { HEX_DBL(+, 1, fe6150ee3e6d4, -, 1), HEX_DBL(+, 1, 2b9af9a28e282, -, 8),
4350 HEX_DBL(+, 1, 0fd3674e1dc5b, -, 61) },
4351 { HEX_DBL(+, 1, fe21c05baa109, -, 1), HEX_DBL(+, 1, 599d4678f24b9, -, 8),
4352 HEX_DBL(+, 1, dafce1f09937b, -, 61) },
4353 { HEX_DBL(+, 1, fde23f9c69cf9, -, 1), HEX_DBL(+, 1, 8799d8c046eb, -, 8),
4354 HEX_DBL(+, 1, ffa0ce0bdd217, -, 65) },
4355 { HEX_DBL(+, 1, fda2ceaa956e8, -, 1), HEX_DBL(+, 1, b590b1e5951ee, -, 8),
4356 HEX_DBL(+, 1, 645a769232446, -, 62) },
4357 { HEX_DBL(+, 1, fd636d8047a1f, -, 1), HEX_DBL(+, 1, e381d3555dbcf, -, 8),
4358 HEX_DBL(+, 1, 882320d368331, -, 61) },
4359 { HEX_DBL(+, 1, fd241c179e0cc, -, 1), HEX_DBL(+, 1, 08b69f3dccde, -, 7),
4360 HEX_DBL(+, 1, 01ad5065aba9e, -, 61) },
4361 { HEX_DBL(+, 1, fce4da6ab93e8, -, 1), HEX_DBL(+, 1, 1fa97a61dd298, -, 7),
4362 HEX_DBL(+, 1, 84cd1f931ae34, -, 60) },
4363 { HEX_DBL(+, 1, fca5a873bcb19, -, 1), HEX_DBL(+, 1, 36997bcc54a3f, -, 7),
4364 HEX_DBL(+, 1, 1485e97eaee03, -, 60) },
4365 { HEX_DBL(+, 1, fc66862ccec93, -, 1), HEX_DBL(+, 1, 4d86a43264a4f, -, 7),
4366 HEX_DBL(+, 1, c75e63370988b, -, 61) },
4367 { HEX_DBL(+, 1, fc27739018cfe, -, 1), HEX_DBL(+, 1, 6470f448fb09d, -, 7),
4368 HEX_DBL(+, 1, d7361eeaed0a1, -, 65) },
4369 { HEX_DBL(+, 1, fbe87097c6f5a, -, 1), HEX_DBL(+, 1, 7b586cc4c2523, -, 7),
4370 HEX_DBL(+, 1, b3df952cc473c, -, 61) },
4371 { HEX_DBL(+, 1, fba97d3e084dd, -, 1), HEX_DBL(+, 1, 923d0e5a21e06, -, 7),
4372 HEX_DBL(+, 1, cf56c7b64ae5d, -, 62) },
4373 { HEX_DBL(+, 1, fb6a997d0ecdc, -, 1), HEX_DBL(+, 1, a91ed9bd3df9a, -, 7),
4374 HEX_DBL(+, 1, b957bdcd89e43, -, 61) },
4375 { HEX_DBL(+, 1, fb2bc54f0f4ab, -, 1), HEX_DBL(+, 1, bffdcfa1f7fbb, -, 7),
4376 HEX_DBL(+, 1, ea8cad9a21771, -, 62) },
4377 { HEX_DBL(+, 1, faed00ae41783, -, 1), HEX_DBL(+, 1, d6d9f0bbee6f6, -, 7),
4378 HEX_DBL(+, 1, 5762a9af89c82, -, 60) },
4379 { HEX_DBL(+, 1, faae4b94dfe64, -, 1), HEX_DBL(+, 1, edb33dbe7d335, -, 7),
4380 HEX_DBL(+, 1, 21e24fc245697, -, 62) },
4381 { HEX_DBL(+, 1, fa6fa5fd27ff8, -, 1), HEX_DBL(+, 1, 0244dbae5ed05, -, 6),
4382 HEX_DBL(+, 1, 12ef51b967102, -, 60) },
4383 { HEX_DBL(+, 1, fa310fe15a078, -, 1), HEX_DBL(+, 1, 0daeaf24c3529, -, 6),
4384 HEX_DBL(+, 1, 10d3cfca60b45, -, 59) },
4385 { HEX_DBL(+, 1, f9f2893bb9192, -, 1), HEX_DBL(+, 1, 1917199bb66bc, -, 6),
4386 HEX_DBL(+, 1, 6cf6034c32e19, -, 60) },
4387 { HEX_DBL(+, 1, f9b412068b247, -, 1), HEX_DBL(+, 1, 247e1b6c615d5, -, 6),
4388 HEX_DBL(+, 1, 42f0fffa229f7, -, 61) },
4389 { HEX_DBL(+, 1, f975aa3c18ed6, -, 1), HEX_DBL(+, 1, 2fe3b4efcc5ad, -, 6),
4390 HEX_DBL(+, 1, 70106136a8919, -, 60) },
4391 { HEX_DBL(+, 1, f93751d6ae09b, -, 1), HEX_DBL(+, 1, 3b47e67edea93, -, 6),
4392 HEX_DBL(+, 1, 38dd5a4f6959a, -, 59) },
4393 { HEX_DBL(+, 1, f8f908d098df6, -, 1), HEX_DBL(+, 1, 46aab0725ea6c, -, 6),
4394 HEX_DBL(+, 1, 821fc1e799e01, -, 60) },
4395 { HEX_DBL(+, 1, f8bacf242aa2c, -, 1), HEX_DBL(+, 1, 520c1322f1e4e, -, 6),
4396 HEX_DBL(+, 1, 129dcda3ad563, -, 60) },
4397 { HEX_DBL(+, 1, f87ca4cbb755, -, 1), HEX_DBL(+, 1, 5d6c0ee91d2ab, -, 6),
4398 HEX_DBL(+, 1, c5b190c04606e, -, 62) },
4399 { HEX_DBL(+, 1, f83e89c195c25, -, 1), HEX_DBL(+, 1, 68caa41d448c3, -, 6),
4400 HEX_DBL(+, 1, 4723441195ac9, -, 59) }
4401 };
4402
4403 static double __loglTable3[8][3] = {
4404 { HEX_DBL(+, 1, 000e00c40ab89, +, 0), HEX_DBL(-, 1, 4332be0032168, -, 12),
4405 HEX_DBL(+, 1, a1003588d217a, -, 65) },
4406 { HEX_DBL(+, 1, 000a006403e82, +, 0), HEX_DBL(-, 1, cdb2987366fcc, -, 13),
4407 HEX_DBL(+, 1, 5c86001294bbc, -, 67) },
4408 { HEX_DBL(+, 1, 0006002400d8, +, 0), HEX_DBL(-, 1, 150297c90fa6f, -, 13),
4409 HEX_DBL(+, 1, 01fb4865fae32, -, 66) },
4410 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4411 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4412 { HEX_DBL(+, 1, ffe8011ff280a, -, 1), HEX_DBL(+, 1, 14f8daf5e3d3b, -, 12),
4413 HEX_DBL(+, 1, 3c933b4b6b914, -, 68) },
4414 { HEX_DBL(+, 1, ffd8031fc184e, -, 1), HEX_DBL(+, 1, cd978c38042bb, -, 12),
4415 HEX_DBL(+, 1, 10f8e642e66fd, -, 65) },
4416 { HEX_DBL(+, 1, ffc8061f5492b, -, 1), HEX_DBL(+, 1, 43183c878274e, -, 11),
4417 HEX_DBL(+, 1, 5885dd1eb6582, -, 65) }
4418 };
4419
__log2_ep(double * hi,double * lo,double x)4420 static void __log2_ep(double *hi, double *lo, double x)
4421 {
4422 union {
4423 uint64_t i;
4424 double d;
4425 } uu;
4426
4427 int m;
4428 double f = reference_frexp(x, &m);
4429
4430 // bring f in [0.75, 1.5)
4431 if (f < 0.75)
4432 {
4433 f *= 2.0;
4434 m -= 1;
4435 }
4436
4437 // index first table .... brings down to [1-2^-7, 1+2^6)
4438 uu.d = f;
4439 int index =
4440 (int)(((uu.i + ((uint64_t)1 << 51)) & 0x000fc00000000000ULL) >> 46);
4441 double r1 = __loglTable1[index][0];
4442 double logr1hi = __loglTable1[index][1];
4443 double logr1lo = __loglTable1[index][2];
4444 // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since
4445 // |m| <= 1023 which needs 10bits at max, we can directly add m to log1hi
4446 // without spilling
4447 logr1hi += m;
4448
4449 // argument reduction needs to be in double-double since reduced argument
4450 // will form the leading term of polynomial approximation which sets the
4451 // precision we eventually achieve
4452 double zhi, zlo;
4453 MulD(&zhi, &zlo, r1, uu.d);
4454
4455 // second index table .... brings down to [1-2^-12, 1+2^-11)
4456 uu.d = zhi;
4457 index = (int)(((uu.i + ((uint64_t)1 << 46)) & 0x00007e0000000000ULL) >> 41);
4458 double r2 = __loglTable2[index][0];
4459 double logr2hi = __loglTable2[index][1];
4460 double logr2lo = __loglTable2[index][2];
4461
4462 // reduce argument
4463 MulDD(&zhi, &zlo, zhi, zlo, r2, 0.0);
4464
4465 // third index table .... brings down to [1-2^-14, 1+2^-13)
4466 // Actually reduction to 2^-11 would have been sufficient to calculate
4467 // second order term in polynomial in double rather than double-double, I
4468 // reduced it a bit more to make sure other systematic arithmetic errors
4469 // are guarded against .... also this allow lower order product of leading
4470 // polynomial term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather
4471 // than double-double ... hence only term that needs to be done in
4472 // double-double is Ao_hi*z_hi
4473 uu.d = zhi;
4474 index = (int)(((uu.i + ((uint64_t)1 << 41)) & 0x0000038000000000ULL) >> 39);
4475 double r3 = __loglTable3[index][0];
4476 double logr3hi = __loglTable3[index][1];
4477 double logr3lo = __loglTable3[index][2];
4478
4479 // log2(x) = m + log2(r1) + log2(r1) + log2(1 + (zh + zlo))
4480 // calculate sum of first three terms ... note that m has already
4481 // been added to log2(r1)_hi
4482 double log2hi, log2lo;
4483 AddDD(&log2hi, &log2lo, logr1hi, logr1lo, logr2hi, logr2lo);
4484 AddDD(&log2hi, &log2lo, logr3hi, logr3lo, log2hi, log2lo);
4485
4486 // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after
4487 // this
4488 MulDD(&zhi, &zlo, zhi, zlo, r3, 0.0);
4489 // we dont need to do full double-double substract here. substracting 1.0
4490 // for higher term is exact
4491 zhi = zhi - 1.0;
4492 // normalize
4493 AddD(&zhi, &zlo, zhi, zlo);
4494
4495 // polynomail fitting to compute log2(1 + z) ... forth order polynomial fit
4496 // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in
4497 // [-2^-14, 2^-13] log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4
4498 // => log2(1 + z) = Ao*z + A1*z^2 + A2*z^3 + A3*z^4 + A4*z^5
4499 // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 +
4500 // A4*z^3) since we are looking for at least 64 digits of precision and z in
4501 // [-2^-14, 2^-13], final term can be done in double .... also Aolo*zhi +
4502 // Aohi*zlo can be done in double .... Aohi*zhi needs to be done in
4503 // double-double
4504
4505 double Aohi = HEX_DBL(+, 1, 71547652b82fe, +, 0);
4506 double Aolo = HEX_DBL(+, 1, 777c9cbb675c, -, 56);
4507 double y;
4508 y = HEX_DBL(+, 1, 276d2736fade7, -, 2);
4509 y = HEX_DBL(-, 1, 7154765782df1, -, 2) + y * zhi;
4510 y = HEX_DBL(+, 1, ec709dc3a0f67, -, 2) + y * zhi;
4511 y = HEX_DBL(-, 1, 71547652b82fe, -, 1) + y * zhi;
4512 double zhisq = zhi * zhi;
4513 y = y * zhisq;
4514 y = y + zhi * Aolo;
4515 y = y + zlo * Aohi;
4516
4517 MulD(&zhi, &zlo, Aohi, zhi);
4518 AddDD(&zhi, &zlo, zhi, zlo, y, 0.0);
4519 AddDD(&zhi, &zlo, zhi, zlo, log2hi, log2lo);
4520
4521 *hi = zhi;
4522 *lo = zlo;
4523 }
4524
reference_powl(long double x,long double y)4525 long double reference_powl(long double x, long double y)
4526 {
4527 // this will be used for testing doubles i.e. arguments will
4528 // be doubles so cast the input back to double ... returned
4529 // result will be long double though .... > 53 bits of precision
4530 // if platform allows.
4531 // ===========
4532 // New finding.
4533 // ===========
4534 // this function is getting used for computing reference cube root (cbrt)
4535 // as follows __powl( x, 1.0L/3.0L ) so if the y are assumed to
4536 // be double and is converted from long double to double, truncation
4537 // causes errors. So we need to tread y as long double and convert it
4538 // to hi, lo doubles when performing y*log2(x).
4539
4540 static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
4541
4542 // if x = 1, return x for any y, even NaN
4543 if (x == 1.0) return x;
4544
4545 // if y == 0, return 1 for any x, even NaN
4546 if (y == 0.0) return 1.0L;
4547
4548 // get NaNs out of the way
4549 if (x != x || y != y) return x + y;
4550
4551 // do the work required to sort out edge cases
4552 double fabsy = reference_fabs(y);
4553 double fabsx = reference_fabs(x);
4554 double iy = reference_rint(
4555 fabsy); // we do round to nearest here so that |fy| <= 0.5
4556 if (iy > fabsy) // convert nearbyint to floor
4557 iy -= 1.0;
4558 int isOddInt = 0;
4559 if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
4560 isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
4561
4562 /// test a few more edge cases
4563 // deal with x == 0 cases
4564 if (x == 0.0)
4565 {
4566 if (!isOddInt) x = 0.0;
4567
4568 if (y < 0) x = 1.0 / x;
4569
4570 return x;
4571 }
4572
4573 // x == +-Inf cases
4574 if (isinf(fabsx))
4575 {
4576 if (x < 0)
4577 {
4578 if (isOddInt)
4579 {
4580 if (y < 0)
4581 return -0.0;
4582 else
4583 return -INFINITY;
4584 }
4585 else
4586 {
4587 if (y < 0)
4588 return 0.0;
4589 else
4590 return INFINITY;
4591 }
4592 }
4593
4594 if (y < 0) return 0;
4595 return INFINITY;
4596 }
4597
4598 // y = +-inf cases
4599 if (isinf(fabsy))
4600 {
4601 if (x == -1) return 1;
4602
4603 if (y < 0)
4604 {
4605 if (fabsx < 1) return INFINITY;
4606 return 0;
4607 }
4608 if (fabsx < 1) return 0;
4609 return INFINITY;
4610 }
4611
4612 // x < 0 and y non integer case
4613 if (x < 0 && iy != fabsy)
4614 {
4615 // return nan;
4616 return cl_make_nan();
4617 }
4618
4619 // speedy resolution of sqrt and reciprocal sqrt
4620 if (fabsy == 0.5)
4621 {
4622 long double xl = sqrtl(x);
4623 if (y < 0) xl = 1.0 / xl;
4624 return xl;
4625 }
4626
4627 double log2x_hi, log2x_lo;
4628
4629 // extended precision log .... accurate to at least 64-bits + couple of
4630 // guard bits
4631 __log2_ep(&log2x_hi, &log2x_lo, fabsx);
4632
4633 double ylog2x_hi, ylog2x_lo;
4634
4635 double y_hi = (double)y;
4636 double y_lo = (double)(y - (long double)y_hi);
4637
4638 // compute product of y*log2(x)
4639 // scale to avoid overflow in double-double multiplication
4640 if (reference_fabs(y) > HEX_DBL(+, 1, 0, +, 970))
4641 {
4642 y_hi = reference_ldexp(y_hi, -53);
4643 y_lo = reference_ldexp(y_lo, -53);
4644 }
4645 MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
4646 if (fabs(y) > HEX_DBL(+, 1, 0, +, 970))
4647 {
4648 ylog2x_hi = reference_ldexp(ylog2x_hi, 53);
4649 ylog2x_lo = reference_ldexp(ylog2x_lo, 53);
4650 }
4651
4652 long double powxy;
4653 if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
4654 {
4655 powxy =
4656 reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
4657 }
4658 else
4659 {
4660 // separate integer + fractional part
4661 long int m = lrint(ylog2x_hi);
4662 AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
4663
4664 // revert to long double arithemtic
4665 long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
4666 long double tmp = reference_exp2l(ylog2x);
4667 powxy = reference_scalblnl(tmp, m);
4668 }
4669
4670 // if y is odd integer and x is negative, reverse sign
4671 if (isOddInt & reference_signbit(x)) powxy = -powxy;
4672 return powxy;
4673 }
4674
reference_nextafter(double xx,double yy)4675 double reference_nextafter(double xx, double yy)
4676 {
4677 float x = (float)xx;
4678 float y = (float)yy;
4679
4680 // take care of nans
4681 if (x != x) return x;
4682
4683 if (y != y) return y;
4684
4685 if (x == y) return y;
4686
4687 int32f_t a, b;
4688
4689 a.f = x;
4690 b.f = y;
4691
4692 if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
4693 if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
4694
4695 a.i += (a.i < b.i) ? 1 : -1;
4696 a.i = (a.i < 0) ? (cl_int)0x80000000 - a.i : a.i;
4697
4698 return a.f;
4699 }
4700
4701
reference_nextafterl(long double xx,long double yy)4702 long double reference_nextafterl(long double xx, long double yy)
4703 {
4704 double x = (double)xx;
4705 double y = (double)yy;
4706
4707 // take care of nans
4708 if (x != x) return x;
4709
4710 if (y != y) return y;
4711
4712 int64d_t a, b;
4713
4714 a.d = x;
4715 b.d = y;
4716
4717 int64_t tmp = 0x8000000000000000LL;
4718
4719 if (a.l & tmp) a.l = tmp - a.l;
4720 if (b.l & tmp) b.l = tmp - b.l;
4721
4722 // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y =
4723 // 0.0f) test needs to be done using integer rep because subnormals may be
4724 // flushed to zero on some platforms
4725 if (a.l == b.l) return y;
4726
4727 a.l += (a.l < b.l) ? 1 : -1;
4728 a.l = (a.l < 0) ? tmp - a.l : a.l;
4729
4730 return a.d;
4731 }
4732
reference_fdim(double xx,double yy)4733 double reference_fdim(double xx, double yy)
4734 {
4735 float x = (float)xx;
4736 float y = (float)yy;
4737
4738 if (x != x) return x;
4739
4740 if (y != y) return y;
4741
4742 float r = (x > y) ? (float)reference_subtract(x, y) : 0.0f;
4743 return r;
4744 }
4745
4746
reference_fdiml(long double xx,long double yy)4747 long double reference_fdiml(long double xx, long double yy)
4748 {
4749 double x = (double)xx;
4750 double y = (double)yy;
4751
4752 if (x != x) return x;
4753
4754 if (y != y) return y;
4755
4756 double r = (x > y) ? (double)reference_subtractl(x, y) : 0.0;
4757 return r;
4758 }
4759
reference_remquo(double xd,double yd,int * n)4760 double reference_remquo(double xd, double yd, int *n)
4761 {
4762 float xx = (float)xd;
4763 float yy = (float)yd;
4764
4765 if (isnan(xx) || isnan(yy) || fabsf(xx) == INFINITY || yy == 0.0)
4766 {
4767 *n = 0;
4768 return cl_make_nan();
4769 }
4770
4771 if (fabsf(yy) == INFINITY || xx == 0.0f)
4772 {
4773 *n = 0;
4774 return xd;
4775 }
4776
4777 if (fabsf(xx) == fabsf(yy))
4778 {
4779 *n = (xx == yy) ? 1 : -1;
4780 return reference_signbit(xx) ? -0.0 : 0.0;
4781 }
4782
4783 int signx = reference_signbit(xx) ? -1 : 1;
4784 int signy = reference_signbit(yy) ? -1 : 1;
4785 int signn = (signx == signy) ? 1 : -1;
4786 float x = fabsf(xx);
4787 float y = fabsf(yy);
4788
4789 int ex, ey;
4790 ex = reference_ilogb(x);
4791 ey = reference_ilogb(y);
4792 float xr = x;
4793 float yr = y;
4794 uint32_t q = 0;
4795
4796 if (ex - ey >= -1)
4797 {
4798
4799 yr = (float)reference_ldexp(y, -ey);
4800 xr = (float)reference_ldexp(x, -ex);
4801
4802 if (ex - ey >= 0)
4803 {
4804 int i;
4805 for (i = ex - ey; i > 0; i--)
4806 {
4807 q <<= 1;
4808 if (xr >= yr)
4809 {
4810 xr -= yr;
4811 q += 1;
4812 }
4813 xr += xr;
4814 }
4815 q <<= 1;
4816 if (xr > yr)
4817 {
4818 xr -= yr;
4819 q += 1;
4820 }
4821 }
4822 else // ex-ey = -1
4823 xr = reference_ldexp(xr, ex - ey);
4824 }
4825
4826 if ((yr < 2.0f * xr) || ((yr == 2.0f * xr) && (q & 0x00000001)))
4827 {
4828 xr -= yr;
4829 q += 1;
4830 }
4831
4832 if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
4833
4834 int qout = q & 0x0000007f;
4835 if (signn < 0) qout = -qout;
4836 if (xx < 0.0) xr = -xr;
4837
4838 *n = qout;
4839
4840 return xr;
4841 }
4842
reference_remquol(long double xd,long double yd,int * n)4843 long double reference_remquol(long double xd, long double yd, int *n)
4844 {
4845 double xx = (double)xd;
4846 double yy = (double)yd;
4847
4848 if (isnan(xx) || isnan(yy) || fabs(xx) == INFINITY || yy == 0.0)
4849 {
4850 *n = 0;
4851 return cl_make_nan();
4852 }
4853
4854 if (reference_fabs(yy) == INFINITY || xx == 0.0)
4855 {
4856 *n = 0;
4857 return xd;
4858 }
4859
4860 if (reference_fabs(xx) == reference_fabs(yy))
4861 {
4862 *n = (xx == yy) ? 1 : -1;
4863 return reference_signbit(xx) ? -0.0 : 0.0;
4864 }
4865
4866 int signx = reference_signbit(xx) ? -1 : 1;
4867 int signy = reference_signbit(yy) ? -1 : 1;
4868 int signn = (signx == signy) ? 1 : -1;
4869 double x = reference_fabs(xx);
4870 double y = reference_fabs(yy);
4871
4872 int ex, ey;
4873 ex = reference_ilogbl(x);
4874 ey = reference_ilogbl(y);
4875 double xr = x;
4876 double yr = y;
4877 uint32_t q = 0;
4878
4879 if (ex - ey >= -1)
4880 {
4881 yr = reference_ldexp(y, -ey);
4882 xr = reference_ldexp(x, -ex);
4883 int i;
4884
4885 if (ex - ey >= 0)
4886 {
4887 for (i = ex - ey; i > 0; i--)
4888 {
4889 q <<= 1;
4890 if (xr >= yr)
4891 {
4892 xr -= yr;
4893 q += 1;
4894 }
4895 xr += xr;
4896 }
4897 q <<= 1;
4898 if (xr > yr)
4899 {
4900 xr -= yr;
4901 q += 1;
4902 }
4903 }
4904 else
4905 xr = reference_ldexp(xr, ex - ey);
4906 }
4907
4908 if ((yr < 2.0 * xr) || ((yr == 2.0 * xr) && (q & 0x00000001)))
4909 {
4910 xr -= yr;
4911 q += 1;
4912 }
4913
4914 if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
4915
4916 int qout = q & 0x0000007f;
4917 if (signn < 0) qout = -qout;
4918 if (xx < 0.0) xr = -xr;
4919
4920 *n = qout;
4921 return xr;
4922 }
4923
reference_scalbn(double x,int n)4924 static double reference_scalbn(double x, int n)
4925 {
4926 if (reference_isinf(x) || reference_isnan(x) || x == 0.0) return x;
4927
4928 int bias = 1023;
4929 union {
4930 double d;
4931 cl_long l;
4932 } u;
4933 u.d = (double)x;
4934 int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
4935 if (e == 0)
4936 {
4937 u.l |= ((cl_long)1023 << 52);
4938 u.d -= 1.0;
4939 e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
4940 }
4941 e += n;
4942 if (e >= 2047 || n >= 2098) return reference_copysign(INFINITY, x);
4943 if (e < -51 || n < -2097) return reference_copysign(0.0, x);
4944 if (e <= 0)
4945 {
4946 bias += (e - 1);
4947 e = 1;
4948 }
4949 u.l &= 0x800fffffffffffffLL;
4950 u.l |= ((cl_long)e << 52);
4951 x = u.d;
4952 u.l = ((cl_long)bias << 52);
4953 return x * u.d;
4954 }
4955
reference_scalblnl(long double x,long n)4956 static long double reference_scalblnl(long double x, long n)
4957 {
4958 #if defined(__i386__) || defined(__x86_64__) // INTEL
4959 union {
4960 long double d;
4961 struct
4962 {
4963 cl_ulong m;
4964 cl_ushort sexp;
4965 } u;
4966 } u;
4967 u.u.m = CL_LONG_MIN;
4968
4969 if (reference_isinf(x)) return x;
4970
4971 if (x == 0.0L || n < -2200) return reference_copysignl(0.0L, x);
4972
4973 if (n > 2200) return reference_copysignl(INFINITY, x);
4974
4975 if (n < 0)
4976 {
4977 u.u.sexp = 0x3fff - 1022;
4978 while (n <= -1022)
4979 {
4980 x *= u.d;
4981 n += 1022;
4982 }
4983 u.u.sexp = 0x3fff + n;
4984 x *= u.d;
4985 return x;
4986 }
4987
4988 if (n > 0)
4989 {
4990 u.u.sexp = 0x3fff + 1023;
4991 while (n >= 1023)
4992 {
4993 x *= u.d;
4994 n -= 1023;
4995 }
4996 u.u.sexp = 0x3fff + n;
4997 x *= u.d;
4998 return x;
4999 }
5000
5001 return x;
5002
5003 #elif defined(__arm__) // ARM .. sizeof(long double) == sizeof(double)
5004
5005 #if __DBL_MAX_EXP__ >= __LDBL_MAX_EXP__
5006 if (reference_isinfl(x) || reference_isnanl(x)) return x;
5007
5008 int bias = 1023;
5009 union {
5010 double d;
5011 cl_long l;
5012 } u;
5013 u.d = (double)x;
5014 int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
5015 if (e == 0)
5016 {
5017 u.l |= ((cl_long)1023 << 52);
5018 u.d -= 1.0;
5019 e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
5020 }
5021 e += n;
5022 if (e >= 2047) return reference_copysignl(INFINITY, x);
5023 if (e < -51) return reference_copysignl(0.0, x);
5024 if (e <= 0)
5025 {
5026 bias += (e - 1);
5027 e = 1;
5028 }
5029 u.l &= 0x800fffffffffffffLL;
5030 u.l |= ((cl_long)e << 52);
5031 x = u.d;
5032 u.l = ((cl_long)bias << 52);
5033 return x * u.d;
5034 #endif
5035
5036 #else // PPC
5037 return scalblnl(x, n);
5038 #endif
5039 }
5040
reference_relaxed_exp(double x)5041 double reference_relaxed_exp(double x) { return reference_exp(x); }
5042
reference_exp(double x)5043 double reference_exp(double x)
5044 {
5045 return reference_exp2(x * HEX_DBL(+, 1, 71547652b82fe, +, 0));
5046 }
5047
reference_expl(long double x)5048 long double reference_expl(long double x)
5049 {
5050 #if defined(__PPC__)
5051 long double scale, bias;
5052
5053 // The PPC double long version of expl fails to produce denorm results
5054 // and instead generates a 0.0. Compensate for this limitation by
5055 // computing expl as:
5056 // expl(x + 40) * expl(-40)
5057 // Likewise, overflows can prematurely produce an infinity, so we
5058 // compute expl as:
5059 // expl(x - 40) * expl(40)
5060 scale = 1.0L;
5061 bias = 0.0L;
5062 if (x < -708.0L)
5063 {
5064 bias = 40.0;
5065 scale = expl(-40.0L);
5066 }
5067 else if (x > 708.0L)
5068 {
5069 bias = -40.0L;
5070 scale = expl(40.0L);
5071 }
5072 return expl(x + bias) * scale;
5073 #else
5074 return expl(x);
5075 #endif
5076 }
5077
reference_sinh(double x)5078 double reference_sinh(double x) { return sinh(x); }
5079
reference_sinhl(long double x)5080 long double reference_sinhl(long double x) { return sinhl(x); }
5081
reference_fmod(double x,double y)5082 double reference_fmod(double x, double y)
5083 {
5084 if (x == 0.0 && fabs(y) > 0.0) return x;
5085
5086 if (fabs(x) == INFINITY || y == 0) return cl_make_nan();
5087
5088 if (fabs(y) == INFINITY) // we know x is finite from above
5089 return x;
5090 #if defined(_MSC_VER) && defined(_M_X64)
5091 return fmod(x, y);
5092 #else
5093 return fmodf((float)x, (float)y);
5094 #endif
5095 }
5096
reference_fmodl(long double x,long double y)5097 long double reference_fmodl(long double x, long double y)
5098 {
5099 if (x == 0.0L && fabsl(y) > 0.0L) return x;
5100
5101 if (fabsl(x) == INFINITY || y == 0.0L) return cl_make_nan();
5102
5103 if (fabsl(y) == INFINITY) // we know x is finite from above
5104 return x;
5105
5106 return fmod((double)x, (double)y);
5107 }
5108
reference_modf(double x,double * n)5109 double reference_modf(double x, double *n)
5110 {
5111 if (isnan(x))
5112 {
5113 *n = cl_make_nan();
5114 return cl_make_nan();
5115 }
5116 float nr;
5117 float yr = modff((float)x, &nr);
5118 *n = nr;
5119 return yr;
5120 }
5121
reference_modfl(long double x,long double * n)5122 long double reference_modfl(long double x, long double *n)
5123 {
5124 if (isnan(x))
5125 {
5126 *n = cl_make_nan();
5127 return cl_make_nan();
5128 }
5129 double nr;
5130 double yr = modf((double)x, &nr);
5131 *n = nr;
5132 return yr;
5133 }
5134
reference_fractl(long double x,long double * ip)5135 long double reference_fractl(long double x, long double *ip)
5136 {
5137 if (isnan(x))
5138 {
5139 *ip = cl_make_nan();
5140 return cl_make_nan();
5141 }
5142
5143 double i;
5144 double f = modf((double)x, &i);
5145 if (f < 0.0)
5146 {
5147 f = 1.0 + f;
5148 i -= 1.0;
5149 if (f == 1.0) f = HEX_DBL(+, 1, fffffffffffff, -, 1);
5150 }
5151 *ip = i;
5152 return f;
5153 }
5154
reference_fabsl(long double x)5155 long double reference_fabsl(long double x) { return fabsl(x); }
5156
reference_relaxed_log(double x)5157 double reference_relaxed_log(double x)
5158 {
5159 return (float)reference_log((float)x);
5160 }
5161
reference_log(double x)5162 double reference_log(double x)
5163 {
5164 if (x == 0.0) return -INFINITY;
5165
5166 if (x < 0.0) return cl_make_nan();
5167
5168 if (isinf(x)) return INFINITY;
5169
5170 double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
5171 double logxHi, logxLo;
5172 __log2_ep(&logxHi, &logxLo, x);
5173 return logxHi * log2Hi;
5174 }
5175
reference_logl(long double x)5176 long double reference_logl(long double x)
5177 {
5178 if (x == 0.0) return -INFINITY;
5179
5180 if (x < 0.0) return cl_make_nan();
5181
5182 if (isinf(x)) return INFINITY;
5183
5184 double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
5185 double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
5186 double logxHi, logxLo;
5187 __log2_ep(&logxHi, &logxLo, x);
5188
5189 long double lg2 = (long double)log2Hi + (long double)log2Lo;
5190 long double logx = (long double)logxHi + (long double)logxLo;
5191 return logx * lg2;
5192 }
5193
reference_relaxed_pow(double x,double y)5194 double reference_relaxed_pow(double x, double y)
5195 {
5196 return (float)reference_exp2(((float)y) * (float)reference_log2((float)x));
5197 }
5198
reference_pow(double x,double y)5199 double reference_pow(double x, double y)
5200 {
5201 static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
5202
5203 // if x = 1, return x for any y, even NaN
5204 if (x == 1.0) return x;
5205
5206 // if y == 0, return 1 for any x, even NaN
5207 if (y == 0.0) return 1.0;
5208
5209 // get NaNs out of the way
5210 if (x != x || y != y) return x + y;
5211
5212 // do the work required to sort out edge cases
5213 double fabsy = reference_fabs(y);
5214 double fabsx = reference_fabs(x);
5215 double iy = reference_rint(
5216 fabsy); // we do round to nearest here so that |fy| <= 0.5
5217 if (iy > fabsy) // convert nearbyint to floor
5218 iy -= 1.0;
5219 int isOddInt = 0;
5220 if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
5221 isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
5222
5223 /// test a few more edge cases
5224 // deal with x == 0 cases
5225 if (x == 0.0)
5226 {
5227 if (!isOddInt) x = 0.0;
5228
5229 if (y < 0) x = 1.0 / x;
5230
5231 return x;
5232 }
5233
5234 // x == +-Inf cases
5235 if (isinf(fabsx))
5236 {
5237 if (x < 0)
5238 {
5239 if (isOddInt)
5240 {
5241 if (y < 0)
5242 return -0.0;
5243 else
5244 return -INFINITY;
5245 }
5246 else
5247 {
5248 if (y < 0)
5249 return 0.0;
5250 else
5251 return INFINITY;
5252 }
5253 }
5254
5255 if (y < 0) return 0;
5256 return INFINITY;
5257 }
5258
5259 // y = +-inf cases
5260 if (isinf(fabsy))
5261 {
5262 if (x == -1) return 1;
5263
5264 if (y < 0)
5265 {
5266 if (fabsx < 1) return INFINITY;
5267 return 0;
5268 }
5269 if (fabsx < 1) return 0;
5270 return INFINITY;
5271 }
5272
5273 // x < 0 and y non integer case
5274 if (x < 0 && iy != fabsy)
5275 {
5276 // return nan;
5277 return cl_make_nan();
5278 }
5279
5280 // speedy resolution of sqrt and reciprocal sqrt
5281 if (fabsy == 0.5)
5282 {
5283 long double xl = reference_sqrt(x);
5284 if (y < 0) xl = 1.0 / xl;
5285 return xl;
5286 }
5287
5288 double hi, lo;
5289 __log2_ep(&hi, &lo, fabsx);
5290 double prod = y * hi;
5291 double result = reference_exp2(prod);
5292 return isOddInt ? reference_copysignd(result, x) : result;
5293 }
5294
reference_sqrt(double x)5295 double reference_sqrt(double x) { return sqrt(x); }
5296
reference_floor(double x)5297 double reference_floor(double x) { return floorf((float)x); }
5298
reference_ldexp(double value,int exponent)5299 double reference_ldexp(double value, int exponent)
5300 {
5301 #ifdef __MINGW32__
5302 /*
5303 * ====================================================
5304 * This function is from fdlibm: http://www.netlib.org
5305 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
5306 *
5307 * Developed at SunSoft, a Sun Microsystems, Inc. business.
5308 * Permission to use, copy, modify, and distribute this
5309 * software is freely granted, provided that this notice
5310 * is preserved.
5311 * ====================================================
5312 */
5313 if (!finite(value) || value == 0.0) return value;
5314 return scalbn(value, exponent);
5315 #else
5316 return reference_scalbn(value, exponent);
5317 #endif
5318 }
5319
reference_ldexpl(long double x,int n)5320 long double reference_ldexpl(long double x, int n) { return ldexpl(x, n); }
5321
reference_coshl(long double x)5322 long double reference_coshl(long double x) { return coshl(x); }
5323
reference_ceil(double x)5324 double reference_ceil(double x) { return ceilf((float)x); }
5325
reference_ceill(long double x)5326 long double reference_ceill(long double x)
5327 {
5328 if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
5329
5330 long double absx = reference_fabsl(x);
5331 if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
5332
5333 if (absx < 1.0)
5334 {
5335 if (x < 0.0)
5336 return 0.0;
5337 else
5338 return 1.0;
5339 }
5340
5341 long double r = (long double)((cl_long)x);
5342
5343 if (x > 0.0 && r < x) r += 1.0;
5344
5345 return r;
5346 }
5347
5348
reference_acosl(long double x)5349 long double reference_acosl(long double x)
5350 {
5351 long double x2 = x * x;
5352 int i;
5353
5354 // Prepare a head + tail representation of PI in long double. A good
5355 // compiler should get rid of all of this work.
5356 static const cl_ulong pi_bits[2] = {
5357 0x3243F6A8885A308DULL, 0x313198A2E0370734ULL
5358 }; // first 126 bits of pi
5359 // http://www.super-computing.org/pi-hexa_current.html
5360 long double head, tail, temp;
5361 #if __LDBL_MANT_DIG__ >= 64
5362 // long double has 64-bits of precision or greater
5363 temp = (long double)pi_bits[0] * 0x1.0p64L;
5364 head = temp + (long double)pi_bits[1];
5365 temp -= head; // rounding err rounding pi_bits[1] into head
5366 tail = (long double)pi_bits[1] + temp;
5367 head *= HEX_LDBL(+, 1, 0, -, 125);
5368 tail *= HEX_LDBL(+, 1, 0, -, 125);
5369 #else
5370 head = (long double)pi_bits[0];
5371 tail =
5372 (long double)((cl_long)pi_bits[0]
5373 - (cl_long)
5374 head); // residual part of pi_bits[0] after rounding
5375 tail = tail * HEX_LDBL(+, 1, 0, +, 64) + (long double)pi_bits[1];
5376 head *= HEX_LDBL(+, 1, 0, -, 61);
5377 tail *= HEX_LDBL(+, 1, 0, -, 125);
5378 #endif
5379
5380 // oversize values and NaNs go to NaN
5381 if (!(x2 <= 1.0)) return sqrtl(1.0L - x2);
5382
5383 //
5384 // deal with large |x|:
5385 // sqrt( 1 - x**2)
5386 // acos(|x| > sqrt(0.5)) = 2 * atan( z ); z = -------------------- ;
5387 // z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...]
5388 // 1 + x
5389 if (x2 > 0.5)
5390 {
5391 // we handle the x < 0 case as pi - acos(|x|)
5392
5393 long double sign = reference_copysignl(1.0L, x);
5394 long double fabsx = reference_fabsl(x);
5395 head -= head * sign; // x > 0 ? 0 : pi.hi
5396 tail -= tail * sign; // x > 0 ? 0 : pi.low
5397
5398 // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 ) = sqrt(
5399 // (1-x)/(1+x) )
5400 long double z2 = (1.0L - fabsx) / (1.0L + fabsx); // z**2
5401 long double z = sign * sqrtl(z2);
5402
5403 // atan(sqrt(q))
5404 // Minimax fit p(x) = ---------------- - 1
5405 // sqrt(q)
5406 //
5407 // Define q = r*r, and solve for atan(r):
5408 //
5409 // atan(r) = (p(r) + 1) * r = rp(r) + r
5410 static long double atan_coeffs[] = {
5411 HEX_LDBL(-, b, 3f52e0c278293b3, -, 67),
5412 HEX_LDBL(-, a, aaaaaaaaaaa95b8, -, 5),
5413 HEX_LDBL(+, c, ccccccccc992407, -, 6),
5414 HEX_LDBL(-, 9, 24924923024398, -, 6),
5415 HEX_LDBL(+, e, 38e38d6f92c98f3, -, 7),
5416 HEX_LDBL(-, b, a2e89bfb8393ec6, -, 7),
5417 HEX_LDBL(+, 9, d89a9f574d412cb, -, 7),
5418 HEX_LDBL(-, 8, 88580517884c547, -, 7),
5419 HEX_LDBL(+, f, 0ab6756abdad408, -, 8),
5420 HEX_LDBL(-, d, 56a5b07a2f15b49, -, 8),
5421 HEX_LDBL(+, b, 72ab587e46d80b2, -, 8),
5422 HEX_LDBL(-, 8, 62ea24bb5b2e636, -, 8),
5423 HEX_LDBL(+, e, d67c16582123937, -, 10)
5424 }; // minimax fit over [ 0x1.0p-52, 0.18] Max error:
5425 // 0x1.67ea5c184e5d9p-64
5426
5427 // Calculate y = p(r)
5428 const size_t atan_coeff_count =
5429 sizeof(atan_coeffs) / sizeof(atan_coeffs[0]);
5430 long double y = atan_coeffs[atan_coeff_count - 1];
5431 for (i = (int)atan_coeff_count - 2; i >= 0; i--)
5432 y = atan_coeffs[i] + y * z2;
5433
5434 z *= 2.0L; // fold in 2.0 for 2.0 * atan(z)
5435 y *= z; // rp(r)
5436
5437 return head + ((y + tail) + z);
5438 }
5439
5440 // do |x| <= sqrt(0.5) here
5441 // acos( sqrt(z) ) -
5442 // PI/2
5443 // Piecewise minimax polynomial fits for p(z) = 1 +
5444 // ------------------------;
5445 // sqrt(z)
5446 //
5447 // Define z = x*x, and solve for acos(x) over x in x >= 0:
5448 //
5449 // acos( sqrt(z) ) = acos(x) = x*(p(z)-1) + PI/2 = xp(x**2) - x + PI/2
5450 //
5451 const long double coeffs[4][14] = {
5452 { HEX_LDBL(-, a, fa7382e1f347974, -, 10),
5453 HEX_LDBL(-, b, 4d5a992de1ac4da, -, 6),
5454 HEX_LDBL(-, a, c526184bd558c17, -, 7),
5455 HEX_LDBL(-, d, 9ed9b0346ec092a, -, 8),
5456 HEX_LDBL(-, 9, dca410c1f04b1f, -, 8),
5457 HEX_LDBL(-, f, 76e411ba9581ee5, -, 9),
5458 HEX_LDBL(-, c, c71b00479541d8e, -, 9),
5459 HEX_LDBL(-, a, f527a3f9745c9de, -, 9),
5460 HEX_LDBL(-, 9, a93060051f48d14, -, 9),
5461 HEX_LDBL(-, 8, b3d39ad70e06021, -, 9),
5462 HEX_LDBL(-, f, f2ab95ab84f79c, -, 10),
5463 HEX_LDBL(-, e, d1af5f5301ccfe4, -, 10),
5464 HEX_LDBL(-, e, 1b53ba562f0f74a, -, 10),
5465 HEX_LDBL(-, d, 6a3851330e15526, -,
5466 10) }, // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ]
5467 // Error: 0x1.97839bf07024p-76
5468
5469 { HEX_LDBL(-, 8, c2f1d638e4c1b48, -, 8),
5470 HEX_LDBL(-, c, d47ac903c311c2c, -, 6),
5471 HEX_LDBL(-, d, e020b2dabd5606a, -, 7),
5472 HEX_LDBL(-, a, 086fafac220f16b, -, 7),
5473 HEX_LDBL(-, 8, 55b5efaf6b86c3e, -, 7),
5474 HEX_LDBL(-, f, 05c9774fed2f571, -, 8),
5475 HEX_LDBL(-, e, 484a93f7f0fc772, -, 8),
5476 HEX_LDBL(-, e, 1a32baef01626e4, -, 8),
5477 HEX_LDBL(-, e, 528e525b5c9c73d, -, 8),
5478 HEX_LDBL(-, e, ddd5d27ad49b2c8, -, 8),
5479 HEX_LDBL(-, f, b3259e7ae10c6f, -, 8),
5480 HEX_LDBL(-, 8, 68998170d5b19b7, -, 7),
5481 HEX_LDBL(-, 9, 4468907f007727, -, 7),
5482 HEX_LDBL(-, a, 2ad5e4906a8e7b3, -,
5483 7) }, // x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ] Error:
5484 // 0x1.647af70073457p-73
5485
5486 { HEX_LDBL(-, f, a76585ad399e7ac, -, 8),
5487 HEX_LDBL(-, e, d665b7dd504ca7c, -, 6),
5488 HEX_LDBL(-, 9, 4c7c2402bd4bc33, -, 6),
5489 HEX_LDBL(-, f, ba76b69074ff71c, -, 7),
5490 HEX_LDBL(-, f, 58117784bdb6d5f, -, 7),
5491 HEX_LDBL(-, 8, 22ddd8eef53227d, -, 6),
5492 HEX_LDBL(-, 9, 1d1d3b57a63cdb4, -, 6),
5493 HEX_LDBL(-, a, 9c4bdc40cca848, -, 6),
5494 HEX_LDBL(-, c, b673b12794edb24, -, 6),
5495 HEX_LDBL(-, f, 9290a06e31575bf, -, 6),
5496 HEX_LDBL(-, 9, b4929c16aeb3d1f, -, 5),
5497 HEX_LDBL(-, c, 461e725765a7581, -, 5),
5498 HEX_LDBL(-, 8, 0a59654c98d9207, -, 4),
5499 HEX_LDBL(-, a, 6de6cbd96c80562, -,
5500 4) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ] Error:
5501 // 0x1.b0246c304ce1ap-70
5502
5503 { HEX_LDBL(-, b, dca8b0359f96342, -, 7),
5504 HEX_LDBL(-, 8, cd2522fcde9823, -, 5),
5505 HEX_LDBL(-, d, 2af9397b27ff74d, -, 6),
5506 HEX_LDBL(-, d, 723f2c2c2409811, -, 6),
5507 HEX_LDBL(-, f, ea8f8481ecc3cd1, -, 6),
5508 HEX_LDBL(-, a, 43fd8a7a646b0b2, -, 5),
5509 HEX_LDBL(-, e, 01b0bf63a4e8d76, -, 5),
5510 HEX_LDBL(-, 9, f0b7096a2a7b4d, -, 4),
5511 HEX_LDBL(-, e, 872e7c5a627ab4c, -, 4),
5512 HEX_LDBL(-, a, dbd760a1882da48, -, 3),
5513 HEX_LDBL(-, 8, 424e4dea31dd273, -, 2),
5514 HEX_LDBL(-, c, c05d7730963e793, -, 2),
5515 HEX_LDBL(-, a, 523d97197cd124a, -, 1),
5516 HEX_LDBL(-, 8, 307ba943978aaee, +,
5517 0) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ] Error:
5518 // 0x1.9ecff73da69c9p-66
5519 };
5520
5521 const long double offsets[4] = { 0.0625, 0.1875, 0.3125, 0.4375 };
5522 const size_t coeff_count = sizeof(coeffs[0]) / sizeof(coeffs[0][0]);
5523
5524 // reduce the incoming values a bit so that they are in the range
5525 // [-0x1.0p-4, 0x1.0p-4]
5526 const long double *c;
5527 i = x2 * 8.0L;
5528 c = coeffs[i];
5529 x2 -= offsets[i]; // exact
5530
5531 // calcualte p(x2)
5532 long double y = c[coeff_count - 1];
5533 for (i = (int)coeff_count - 2; i >= 0; i--) y = c[i] + y * x2;
5534
5535 // xp(x2)
5536 y *= x;
5537
5538 // return xp(x2) - x + PI/2
5539 return head + ((y + tail) - x);
5540 }
5541
reference_relaxed_acos(double x)5542 double reference_relaxed_acos(double x) { return reference_acos(x); }
5543
reference_log10(double x)5544 double reference_log10(double x)
5545 {
5546 if (x == 0.0) return -INFINITY;
5547
5548 if (x < 0.0) return cl_make_nan();
5549
5550 if (isinf(x)) return INFINITY;
5551
5552 double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
5553 double logxHi, logxLo;
5554 __log2_ep(&logxHi, &logxLo, x);
5555 return logxHi * log2Hi;
5556 }
5557
reference_relaxed_log10(double x)5558 double reference_relaxed_log10(double x) { return reference_log10(x); }
5559
reference_log10l(long double x)5560 long double reference_log10l(long double x)
5561 {
5562 if (x == 0.0) return -INFINITY;
5563
5564 if (x < 0.0) return cl_make_nan();
5565
5566 if (isinf(x)) return INFINITY;
5567
5568 double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
5569 double log2Lo = HEX_DBL(+, 1, e623e2566b02d, -, 55);
5570 double logxHi, logxLo;
5571 __log2_ep(&logxHi, &logxLo, x);
5572
5573 long double lg2 = (long double)log2Hi + (long double)log2Lo;
5574 long double logx = (long double)logxHi + (long double)logxLo;
5575 return logx * lg2;
5576 }
5577
reference_acos(double x)5578 double reference_acos(double x) { return acos(x); }
5579
reference_atan2(double x,double y)5580 double reference_atan2(double x, double y)
5581 {
5582 #if defined(_WIN32)
5583 // fix edge cases for Windows
5584 if (isinf(x) && isinf(y))
5585 {
5586 double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
5587 return (x > 0) ? retval : -retval;
5588 }
5589 #endif // _WIN32
5590 return atan2(x, y);
5591 }
5592
reference_atan2l(long double x,long double y)5593 long double reference_atan2l(long double x, long double y)
5594 {
5595 #if defined(_WIN32)
5596 // fix edge cases for Windows
5597 if (isinf(x) && isinf(y))
5598 {
5599 long double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
5600 return (x > 0) ? retval : -retval;
5601 }
5602 #endif // _WIN32
5603 return atan2l(x, y);
5604 }
5605
reference_frexp(double a,int * exp)5606 double reference_frexp(double a, int *exp)
5607 {
5608 if (isnan(a) || isinf(a) || a == 0.0)
5609 {
5610 *exp = 0;
5611 return a;
5612 }
5613
5614 union {
5615 cl_double d;
5616 cl_ulong l;
5617 } u;
5618
5619 u.d = a;
5620
5621 // separate out sign
5622 cl_ulong s = u.l & 0x8000000000000000ULL;
5623 u.l &= 0x7fffffffffffffffULL;
5624 int bias = -1022;
5625
5626 if ((u.l & 0x7ff0000000000000ULL) == 0)
5627 {
5628 double d = u.l;
5629 u.d = d;
5630 bias -= 1074;
5631 }
5632
5633 int e = (int)((u.l & 0x7ff0000000000000ULL) >> 52);
5634 u.l &= 0x000fffffffffffffULL;
5635 e += bias;
5636 u.l |= ((cl_ulong)1022 << 52);
5637 u.l |= s;
5638
5639 *exp = e;
5640 return u.d;
5641 }
5642
reference_frexpl(long double a,int * exp)5643 long double reference_frexpl(long double a, int *exp)
5644 {
5645 if (isnan(a) || isinf(a) || a == 0.0)
5646 {
5647 *exp = 0;
5648 return a;
5649 }
5650
5651 if (sizeof(long double) == sizeof(double))
5652 {
5653 return reference_frexp(a, exp);
5654 }
5655 else
5656 {
5657 return frexpl(a, exp);
5658 }
5659 }
5660
5661
reference_atan(double x)5662 double reference_atan(double x) { return atan(x); }
5663
reference_atanl(long double x)5664 long double reference_atanl(long double x) { return atanl(x); }
5665
reference_asinl(long double x)5666 long double reference_asinl(long double x) { return asinl(x); }
5667
reference_asin(double x)5668 double reference_asin(double x) { return asin(x); }
5669
reference_relaxed_asin(double x)5670 double reference_relaxed_asin(double x) { return reference_asin(x); }
5671
reference_fabs(double x)5672 double reference_fabs(double x) { return fabs(x); }
5673
reference_cosh(double x)5674 double reference_cosh(double x) { return cosh(x); }
5675
reference_sqrtl(long double x)5676 long double reference_sqrtl(long double x)
5677 {
5678 #if defined(__SSE2__) \
5679 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
5680 __m128d result128 = _mm_set_sd((double)x);
5681 result128 = _mm_sqrt_sd(result128, result128);
5682 return _mm_cvtsd_f64(result128);
5683 #else
5684 volatile double dx = x;
5685 return sqrt(dx);
5686 #endif
5687 }
5688
reference_tanhl(long double x)5689 long double reference_tanhl(long double x) { return tanhl(x); }
5690
reference_floorl(long double x)5691 long double reference_floorl(long double x)
5692 {
5693 if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
5694
5695 long double absx = reference_fabsl(x);
5696 if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
5697
5698 if (absx < 1.0)
5699 {
5700 if (x < 0.0)
5701 return -1.0;
5702 else
5703 return 0.0;
5704 }
5705
5706 long double r = (long double)((cl_long)x);
5707
5708 if (x < 0.0 && r > x) r -= 1.0;
5709
5710 return r;
5711 }
5712
5713
reference_tanh(double x)5714 double reference_tanh(double x) { return tanh(x); }
5715
reference_assignmentl(long double x)5716 long double reference_assignmentl(long double x) { return x; }
5717
reference_notl(long double x)5718 int reference_notl(long double x)
5719 {
5720 int r = !x;
5721 return r;
5722 }
5723