1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "reference_math.h"
18 #include "harness/compat.h"
19 
20 #include <climits>
21 
22 #if !defined(_WIN32)
23 #include <cstring>
24 #endif
25 
26 #include "utility.h"
27 
28 #if defined(__SSE__)                                                           \
29     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
30 #include <xmmintrin.h>
31 #endif
32 #if defined(__SSE2__)                                                          \
33     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
34 #include <emmintrin.h>
35 #endif
36 
37 #ifndef M_PI_4
38 #define M_PI_4 (M_PI / 4)
39 #endif
40 
41 #pragma STDC FP_CONTRACT OFF
42 static void __log2_ep(double *hi, double *lo, double x);
43 
44 typedef union {
45     uint64_t i;
46     double d;
47 } uint64d_t;
48 
49 static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
50 
51 #define cl_make_nan() _CL_NAN.d
52 
reduce1(double x)53 static double reduce1(double x)
54 {
55     if (fabs(x) >= HEX_DBL(+, 1, 0, +, 53))
56     {
57         if (fabs(x) == INFINITY) return cl_make_nan();
58 
59         return 0.0; // we patch up the sign for sinPi and cosPi later, since
60                     // they need different signs
61     }
62 
63     // Find the nearest multiple of 2
64     const double r = copysign(HEX_DBL(+, 1, 0, +, 53), x);
65     double z = x + r;
66     z -= r;
67 
68     // subtract it from x. Value is now in the range -1 <= x <= 1
69     return x - z;
70 }
71 
reference_acospi(double x)72 double reference_acospi(double x) { return reference_acos(x) / M_PI; }
reference_asinpi(double x)73 double reference_asinpi(double x) { return reference_asin(x) / M_PI; }
reference_atanpi(double x)74 double reference_atanpi(double x) { return reference_atan(x) / M_PI; }
reference_atan2pi(double y,double x)75 double reference_atan2pi(double y, double x)
76 {
77     return reference_atan2(y, x) / M_PI;
78 }
reference_cospi(double x)79 double reference_cospi(double x)
80 {
81     if (reference_fabs(x) >= HEX_DBL(+, 1, 0, +, 52))
82     {
83         if (reference_fabs(x) == INFINITY) return cl_make_nan();
84 
85         // Note this probably fails for odd values between 0x1.0p52 and
86         // 0x1.0p53. However, when starting with single precision inputs, there
87         // will be no odd values.
88 
89         return 1.0;
90     }
91 
92     x = reduce1(x + 0.5);
93 
94     // reduce to [-0.5, 0.5]
95     if (x < -0.5)
96         x = -1 - x;
97     else if (x > 0.5)
98         x = 1 - x;
99 
100     // cosPi zeros are all +0
101     if (x == 0.0) return 0.0;
102 
103     return reference_sin(x * M_PI);
104 }
105 
reference_relaxed_cospi(double x)106 double reference_relaxed_cospi(double x) { return reference_cospi(x); }
107 
reference_relaxed_divide(double x,double y)108 double reference_relaxed_divide(double x, double y)
109 {
110     return (float)(((float)x) / ((float)y));
111 }
112 
reference_divide(double x,double y)113 double reference_divide(double x, double y) { return x / y; }
114 
115 // Add a + b. If the result modulo overflowed, write 1 to *carry, otherwise 0
add_carry(cl_ulong a,cl_ulong b,cl_ulong * carry)116 static inline cl_ulong add_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
117 {
118     cl_ulong result = a + b;
119     *carry = result < a;
120     return result;
121 }
122 
123 // Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise
124 // 0
sub_carry(cl_ulong a,cl_ulong b,cl_ulong * carry)125 static inline cl_ulong sub_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
126 {
127     cl_ulong result = a - b;
128     *carry = result > a;
129     return result;
130 }
131 
fallback_frexpf(float x,int * iptr)132 static float fallback_frexpf(float x, int *iptr)
133 {
134     cl_uint u, v;
135     float fu, fv;
136 
137     memcpy(&u, &x, sizeof(u));
138 
139     cl_uint exponent = u & 0x7f800000U;
140     cl_uint mantissa = u & ~0x7f800000U;
141 
142     // add 1 to the exponent
143     exponent += 0x00800000U;
144 
145     if ((cl_int)exponent < (cl_int)0x01000000)
146     { // subnormal, NaN, Inf
147         mantissa |= 0x3f000000U;
148 
149         v = mantissa & 0xff800000U;
150         u = mantissa;
151         memcpy(&fv, &v, sizeof(v));
152         memcpy(&fu, &u, sizeof(u));
153 
154         fu -= fv;
155 
156         memcpy(&v, &fv, sizeof(v));
157         memcpy(&u, &fu, sizeof(u));
158 
159         exponent = u & 0x7f800000U;
160         mantissa = u & ~0x7f800000U;
161 
162         *iptr = (exponent >> 23) + (-126 + 1 - 126);
163         u = mantissa | 0x3f000000U;
164         memcpy(&fu, &u, sizeof(u));
165         return fu;
166     }
167 
168     *iptr = (exponent >> 23) - 127;
169     u = mantissa | 0x3f000000U;
170     memcpy(&fu, &u, sizeof(u));
171     return fu;
172 }
173 
extractf(float x,cl_uint * mant)174 static inline int extractf(float x, cl_uint *mant)
175 {
176     static float (*frexppf)(float, int *) = NULL;
177     int e;
178 
179     // verify that frexp works properly
180     if (NULL == frexppf)
181     {
182         if (0.5f == frexpf(HEX_FLT(+, 1, 0, -, 130), &e) && e == -129)
183             frexppf = frexpf;
184         else
185             frexppf = fallback_frexpf;
186     }
187 
188     *mant = (cl_uint)(HEX_FLT(+, 1, 0, +, 32) * fabsf(frexppf(x, &e)));
189     return e - 1;
190 }
191 
192 // Shift right by shift bits. Any bits lost on the right side are bitwise OR'd
193 // together and ORd into the LSB of the result
shift_right_sticky_64(cl_ulong * p,int shift)194 static inline void shift_right_sticky_64(cl_ulong *p, int shift)
195 {
196     cl_ulong sticky = 0;
197     cl_ulong r = *p;
198 
199     // C doesn't handle shifts greater than the size of the variable dependably
200     if (shift >= 64)
201     {
202         sticky |= (0 != r);
203         r = 0;
204     }
205     else
206     {
207         sticky |= (0 != (r << (64 - shift)));
208         r >>= shift;
209     }
210 
211     *p = r | sticky;
212 }
213 
214 // Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd
215 // into the LSB of the result
add64(cl_ulong * p,cl_ulong c,int * exponent)216 static inline void add64(cl_ulong *p, cl_ulong c, int *exponent)
217 {
218     cl_ulong carry;
219     c = add_carry(c, *p, &carry);
220     if (carry)
221     {
222         carry = c & 1; // set aside sticky bit
223         c >>= 1; // right shift to deal with overflow
224         c |= carry
225             | 0x8000000000000000ULL; // or in carry bit, and sticky bit. The
226                                      // latter is to prevent rounding from
227                                      // believing we are exact half way case
228         *exponent = *exponent + 1; // adjust exponent
229     }
230 
231     *p = c;
232 }
233 
234 // IEEE-754 round to nearest, ties to even rounding
round_to_nearest_even_float(cl_ulong p,int exponent)235 static float round_to_nearest_even_float(cl_ulong p, int exponent)
236 {
237     union {
238         cl_uint u;
239         cl_float d;
240     } u;
241 
242     // If mantissa is zero, return 0.0f
243     if (p == 0) return 0.0f;
244 
245     // edges
246     if (exponent > 127)
247     {
248         volatile float r = exponent * CL_FLT_MAX; // signal overflow
249 
250         // attempt to fool the compiler into not optimizing the above line away
251         if (r > CL_FLT_MAX) return INFINITY;
252 
253         return r;
254     }
255     if (exponent == -150 && p > 0x8000000000000000ULL)
256         return HEX_FLT(+, 1, 0, -, 149);
257     if (exponent <= -150) return 0.0f;
258 
259     // Figure out which bits go where
260     int shift = 8 + 32;
261     if (exponent < -126)
262     {
263         shift -= 126 + exponent; // subnormal: shift is not 52
264         exponent = -127; //            set exponent to 0
265     }
266     else
267         p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
268                                     // it.
269 
270     // Assemble the double (round toward zero)
271     u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
272 
273     // put a representation of the residual bits into hi
274     p <<= (64 - shift);
275 
276     // round to nearest, ties to even  based on the unused portion of p
277     if (p < 0x8000000000000000ULL) return u.d;
278     if (p == 0x8000000000000000ULL)
279         u.u += u.u & 1U;
280     else
281         u.u++;
282 
283     return u.d;
284 }
285 
round_to_nearest_even_float_ftz(cl_ulong p,int exponent)286 static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent)
287 {
288     extern int gCheckTininessBeforeRounding;
289 
290     union {
291         cl_uint u;
292         cl_float d;
293     } u;
294     int shift = 8 + 32;
295 
296     // If mantissa is zero, return 0.0f
297     if (p == 0) return 0.0f;
298 
299     // edges
300     if (exponent > 127)
301     {
302         volatile float r = exponent * CL_FLT_MAX; // signal overflow
303 
304         // attempt to fool the compiler into not optimizing the above line away
305         if (r > CL_FLT_MAX) return INFINITY;
306 
307         return r;
308     }
309 
310     // Deal with FTZ for gCheckTininessBeforeRounding
311     if (exponent < (gCheckTininessBeforeRounding - 127)) return 0.0f;
312 
313     if (exponent
314         == -127) // only happens for machines that check tininess after rounding
315         p = (p & 1) | (p >> 1);
316     else
317         p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
318                                     // it.
319 
320     cl_ulong q = p;
321 
322 
323     // Assemble the double (round toward zero)
324     u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
325 
326     // put a representation of the residual bits into hi
327     q <<= (64 - shift);
328 
329     // round to nearest, ties to even  based on the unused portion of p
330     if (q > 0x8000000000000000ULL)
331         u.u++;
332     else if (q == 0x8000000000000000ULL)
333         u.u += u.u & 1U;
334 
335     // Deal with FTZ for ! gCheckTininessBeforeRounding
336     if (0 == (u.u & 0x7f800000U)) return 0.0f;
337 
338     return u.d;
339 }
340 
341 
342 // IEEE-754 round toward zero.
round_toward_zero_float(cl_ulong p,int exponent)343 static float round_toward_zero_float(cl_ulong p, int exponent)
344 {
345     union {
346         cl_uint u;
347         cl_float d;
348     } u;
349 
350     // If mantissa is zero, return 0.0f
351     if (p == 0) return 0.0f;
352 
353     // edges
354     if (exponent > 127)
355     {
356         volatile float r = exponent * CL_FLT_MAX; // signal overflow
357 
358         // attempt to fool the compiler into not optimizing the above line away
359         if (r > CL_FLT_MAX) return CL_FLT_MAX;
360 
361         return r;
362     }
363 
364     if (exponent <= -149) return 0.0f;
365 
366     // Figure out which bits go where
367     int shift = 8 + 32;
368     if (exponent < -126)
369     {
370         shift -= 126 + exponent; // subnormal: shift is not 52
371         exponent = -127; //            set exponent to 0
372     }
373     else
374         p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
375                                     // it.
376 
377     // Assemble the double (round toward zero)
378     u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
379 
380     return u.d;
381 }
382 
round_toward_zero_float_ftz(cl_ulong p,int exponent)383 static float round_toward_zero_float_ftz(cl_ulong p, int exponent)
384 {
385     union {
386         cl_uint u;
387         cl_float d;
388     } u;
389     int shift = 8 + 32;
390 
391     // If mantissa is zero, return 0.0f
392     if (p == 0) return 0.0f;
393 
394     // edges
395     if (exponent > 127)
396     {
397         volatile float r = exponent * CL_FLT_MAX; // signal overflow
398 
399         // attempt to fool the compiler into not optimizing the above line away
400         if (r > CL_FLT_MAX) return CL_FLT_MAX;
401 
402         return r;
403     }
404 
405     // Deal with FTZ for gCheckTininessBeforeRounding
406     if (exponent < -126) return 0.0f;
407 
408     cl_ulong q = p &=
409         0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
410 
411     // Assemble the double (round toward zero)
412     u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
413 
414     // put a representation of the residual bits into hi
415     q <<= (64 - shift);
416 
417     return u.d;
418 }
419 
420 // Subtract two significands.
sub64(cl_ulong * c,cl_ulong p,cl_uint * signC,int * expC)421 static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC)
422 {
423     cl_ulong carry;
424     p = sub_carry(*c, p, &carry);
425 
426     if (carry)
427     {
428         *signC ^= 0x80000000U;
429         p = -p;
430     }
431 
432     // normalize
433     if (p)
434     {
435         int shift = 32;
436         cl_ulong test = 1ULL << 32;
437         while (0 == (p & 0x8000000000000000ULL))
438         {
439             if (p < test)
440             {
441                 p <<= shift;
442                 *expC = *expC - shift;
443             }
444             shift >>= 1;
445             test <<= shift;
446         }
447     }
448     else
449     {
450         // zero result.
451         *expC = -200;
452         *signC =
453             0; // IEEE rules say a - a = +0 for all rounding modes except -inf
454     }
455 
456     *c = p;
457 }
458 
459 
reference_fma(float a,float b,float c,int shouldFlush)460 float reference_fma(float a, float b, float c, int shouldFlush)
461 {
462     static const cl_uint kMSB = 0x80000000U;
463 
464     // Make bits accessible
465     union {
466         cl_uint u;
467         cl_float d;
468     } ua;
469     ua.d = a;
470     union {
471         cl_uint u;
472         cl_float d;
473     } ub;
474     ub.d = b;
475     union {
476         cl_uint u;
477         cl_float d;
478     } uc;
479     uc.d = c;
480 
481     // deal with Nans, infinities and zeros
482     if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
483         || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
484         0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
485         0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
486     {
487         FPU_mode_type oldMode;
488         RoundingMode oldRoundMode = kRoundToNearestEven;
489         if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
490 
491         if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
492 
493         memset(&oldMode, 0, sizeof(oldMode));
494         if (shouldFlush) ForceFTZ(&oldMode);
495 
496         a = (float)reference_multiply(
497             a, b); // some risk that the compiler will insert a non-compliant
498                    // fma here on some platforms.
499         a = (float)reference_add(
500             a,
501             c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
502 
503         if (shouldFlush) RestoreFPState(&oldMode);
504 
505         if (gIsInRTZMode) set_round(oldRoundMode, kfloat);
506         return a;
507     }
508 
509     // extract exponent and mantissa
510     //   exponent is a standard unbiased signed integer
511     //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
512     cl_uint mantA, mantB, mantC;
513     int expA = extractf(a, &mantA);
514     int expB = extractf(b, &mantB);
515     int expC = extractf(c, &mantC);
516     cl_uint signC = uc.u & kMSB; // We'll need the sign bit of C later to decide
517                                  // if we are adding or subtracting
518 
519     // exact product of A and B
520     int exponent = expA + expB;
521     cl_uint sign = (ua.u ^ ub.u) & kMSB;
522     cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
523 
524     // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
525     //  The MSB might not be set. If so, fix that. Otherwise, reflect the fact
526     //  that we got another power of two from the multiplication
527     if (0 == (0x8000000000000000ULL & product))
528         product <<= 1;
529     else
530         exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our
531                     // exponent increased.
532 
533     // infinite precision add
534     cl_ulong addend = (cl_ulong)mantC << 32;
535     if (exponent >= expC)
536     {
537         // Shift C relative to the product so that their exponents match
538         if (exponent > expC) shift_right_sticky_64(&addend, exponent - expC);
539 
540         // Add
541         if (sign ^ signC)
542             sub64(&product, addend, &sign, &exponent);
543         else
544             add64(&product, addend, &exponent);
545     }
546     else
547     {
548         // Shift the product relative to C so that their exponents match
549         shift_right_sticky_64(&product, expC - exponent);
550 
551         // add
552         if (sign ^ signC)
553             sub64(&addend, product, &signC, &expC);
554         else
555             add64(&addend, product, &expC);
556 
557         product = addend;
558         exponent = expC;
559         sign = signC;
560     }
561 
562     // round to IEEE result -- we do not do flushing to zero here. That part is
563     // handled manually in ternary.c.
564     if (gIsInRTZMode)
565     {
566         if (shouldFlush)
567             ua.d = round_toward_zero_float_ftz(product, exponent);
568         else
569             ua.d = round_toward_zero_float(product, exponent);
570     }
571     else
572     {
573         if (shouldFlush)
574             ua.d = round_to_nearest_even_float_ftz(product, exponent);
575         else
576             ua.d = round_to_nearest_even_float(product, exponent);
577     }
578 
579     // Set the sign
580     ua.u |= sign;
581 
582     return ua.d;
583 }
584 
reference_relaxed_exp10(double x)585 double reference_relaxed_exp10(double x) { return reference_exp10(x); }
586 
reference_exp10(double x)587 double reference_exp10(double x)
588 {
589     return reference_exp2(x * HEX_DBL(+, 1, a934f0979a371, +, 1));
590 }
591 
592 
reference_ilogb(double x)593 int reference_ilogb(double x)
594 {
595     extern int gDeviceILogb0, gDeviceILogbNaN;
596     union {
597         cl_double f;
598         cl_ulong u;
599     } u;
600 
601     u.f = (float)x;
602     cl_int exponent = (cl_int)(u.u >> 52) & 0x7ff;
603     if (exponent == 0x7ff)
604     {
605         if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
606 
607         return CL_INT_MAX;
608     }
609 
610     if (exponent == 0)
611     { // deal with denormals
612         u.f = x * HEX_DBL(+, 1, 0, +, 64);
613         exponent = (cl_int)(u.u >> 52) & 0x7ff;
614         if (exponent == 0) return gDeviceILogb0;
615 
616         return exponent - (1023 + 64);
617     }
618 
619     return exponent - 1023;
620 }
621 
reference_nan(cl_uint x)622 double reference_nan(cl_uint x)
623 {
624     union {
625         cl_uint u;
626         cl_float f;
627     } u;
628     u.u = x | 0x7fc00000U;
629     return (double)u.f;
630 }
631 
reference_maxmag(double x,double y)632 double reference_maxmag(double x, double y)
633 {
634     double fabsx = fabs(x);
635     double fabsy = fabs(y);
636 
637     if (fabsx < fabsy) return y;
638 
639     if (fabsy < fabsx) return x;
640 
641     return reference_fmax(x, y);
642 }
643 
reference_minmag(double x,double y)644 double reference_minmag(double x, double y)
645 {
646     double fabsx = fabs(x);
647     double fabsy = fabs(y);
648 
649     if (fabsx > fabsy) return y;
650 
651     if (fabsy > fabsx) return x;
652 
653     return reference_fmin(x, y);
654 }
655 
reference_relaxed_mad(double a,double b,double c)656 double reference_relaxed_mad(double a, double b, double c)
657 {
658     return ((float)a) * ((float)b) + (float)c;
659 }
660 
reference_mad(double a,double b,double c)661 double reference_mad(double a, double b, double c) { return a * b + c; }
662 
reference_recip(double x)663 double reference_recip(double x) { return 1.0 / x; }
reference_rootn(double x,int i)664 double reference_rootn(double x, int i)
665 {
666 
667     // rootn ( x, 0 )  returns a NaN.
668     if (0 == i) return cl_make_nan();
669 
670     // rootn ( x, n )  returns a NaN for x < 0 and n is even.
671     if (x < 0 && 0 == (i & 1)) return cl_make_nan();
672 
673     if (x == 0.0)
674     {
675         switch (i & 0x80000001)
676         {
677             // rootn ( +-0,  n ) is +0 for even n > 0.
678             case 0: return 0.0f;
679 
680             // rootn ( +-0,  n ) is +-0 for odd n > 0.
681             case 1: return x;
682 
683             // rootn ( +-0,  n ) is +inf for even n < 0.
684             case 0x80000000: return INFINITY;
685 
686             // rootn ( +-0,  n ) is +-inf for odd n < 0.
687             case 0x80000001: return copysign(INFINITY, x);
688         }
689     }
690 
691     double sign = x;
692     x = reference_fabs(x);
693     x = reference_exp2(reference_log2(x) / (double)i);
694     return reference_copysignd(x, sign);
695 }
696 
reference_rsqrt(double x)697 double reference_rsqrt(double x) { return 1.0 / reference_sqrt(x); }
698 
reference_sinpi(double x)699 double reference_sinpi(double x)
700 {
701     double r = reduce1(x);
702 
703     // reduce to [-0.5, 0.5]
704     if (r < -0.5)
705         r = -1 - r;
706     else if (r > 0.5)
707         r = 1 - r;
708 
709     // sinPi zeros have the same sign as x
710     if (r == 0.0) return reference_copysignd(0.0, x);
711 
712     return reference_sin(r * M_PI);
713 }
714 
reference_relaxed_sinpi(double x)715 double reference_relaxed_sinpi(double x) { return reference_sinpi(x); }
716 
reference_tanpi(double x)717 double reference_tanpi(double x)
718 {
719     // set aside the sign  (allows us to preserve sign of -0)
720     double sign = reference_copysignd(1.0, x);
721     double z = reference_fabs(x);
722 
723     // if big and even  -- caution: only works if x only has single precision
724     if (z >= HEX_DBL(+, 1, 0, +, 24))
725     {
726         if (z == INFINITY) return x - x; // nan
727 
728         return reference_copysignd(
729             0.0, x); // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
730     }
731 
732     // reduce to the range [ -0.5, 0.5 ]
733     double nearest = reference_rint(z); // round to nearest even places n + 0.5
734                                         // values in the right place for us
735     int i = (int)nearest; // test above against 0x1.0p24 avoids overflow here
736     z -= nearest;
737 
738     // correction for odd integer x for the right sign of zero
739     if ((i & 1) && z == 0.0) sign = -sign;
740 
741     // track changes to the sign
742     sign *= reference_copysignd(1.0, z); // really should just be an xor
743     z = reference_fabs(z); // remove the sign again
744 
745     // reduce once more
746     // If we don't do this, rounding error in z * M_PI will cause us not to
747     // return infinities properly
748     if (z > 0.25)
749     {
750         z = 0.5 - z;
751         return sign
752             / reference_tan(z * M_PI); // use system tan to get the right result
753     }
754 
755     //
756     return sign
757         * reference_tan(z * M_PI); // use system tan to get the right result
758 }
759 
reference_pown(double x,int i)760 double reference_pown(double x, int i) { return reference_pow(x, (double)i); }
reference_powr(double x,double y)761 double reference_powr(double x, double y)
762 {
763     // powr ( x, y ) returns NaN for x < 0.
764     if (x < 0.0) return cl_make_nan();
765 
766     // powr ( x, NaN ) returns the NaN for x >= 0.
767     // powr ( NaN, y ) returns the NaN.
768     if (isnan(x) || isnan(y))
769         return x + y; // Note: behavior different here than for pow(1,NaN),
770                       // pow(NaN, 0)
771 
772     if (x == 1.0)
773     {
774         // powr ( +1, +-inf ) returns NaN.
775         if (reference_fabs(y) == INFINITY) return cl_make_nan();
776 
777         // powr ( +1, y ) is 1 for finite y.    (NaN handled above)
778         return 1.0;
779     }
780 
781     if (y == 0.0)
782     {
783         // powr ( +inf, +-0 ) returns NaN.
784         // powr ( +-0, +-0 ) returns NaN.
785         if (x == 0.0 || x == INFINITY) return cl_make_nan();
786 
787         // powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already
788         // handled above)
789         return 1.0;
790     }
791 
792     if (x == 0.0)
793     {
794         // powr ( +-0, -inf) is +inf.
795         // powr ( +-0, y ) is +inf for finite y < 0.
796         if (y < 0.0) return INFINITY;
797 
798         // powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
799         return 0.0;
800     }
801 
802     // x = +inf
803     if (isinf(x))
804     {
805         if (y < 0) return 0;
806         return INFINITY;
807     }
808 
809     double fabsx = reference_fabs(x);
810     double fabsy = reference_fabs(y);
811 
812     // y = +-inf cases
813     if (isinf(fabsy))
814     {
815         if (y < 0)
816         {
817             if (fabsx < 1) return INFINITY;
818             return 0;
819         }
820         if (fabsx < 1) return 0;
821         return INFINITY;
822     }
823 
824     double hi, lo;
825     __log2_ep(&hi, &lo, x);
826     double prod = y * hi;
827     double result = reference_exp2(prod);
828 
829     return result;
830 }
831 
reference_fract(double x,double * ip)832 double reference_fract(double x, double *ip)
833 {
834     if (isnan(x))
835     {
836         *ip = cl_make_nan();
837         return cl_make_nan();
838     }
839 
840     float i;
841     float f = modff((float)x, &i);
842     if (f < 0.0)
843     {
844         f = 1.0f + f;
845         i -= 1.0f;
846         if (f == 1.0f) f = HEX_FLT(+, 1, fffffe, -, 1);
847     }
848     *ip = i;
849     return f;
850 }
851 
852 
reference_add(double x,double y)853 double reference_add(double x, double y)
854 {
855     volatile float a = (float)x;
856     volatile float b = (float)y;
857 
858 #if defined(__SSE__)                                                           \
859     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
860     // defeat x87
861     __m128 va = _mm_set_ss((float)a);
862     __m128 vb = _mm_set_ss((float)b);
863     va = _mm_add_ss(va, vb);
864     _mm_store_ss((float *)&a, va);
865 #elif defined(__PPC__)
866     // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
867     // denorm's to zero. As such, the reference add with FTZ must be emulated in
868     // sw.
869     if (fpu_control & _FPU_MASK_NI)
870     {
871         union {
872             cl_uint u;
873             cl_float d;
874         } ua;
875         ua.d = a;
876         union {
877             cl_uint u;
878             cl_float d;
879         } ub;
880         ub.d = b;
881         cl_uint mantA, mantB;
882         cl_ulong addendA, addendB, sum;
883         int expA = extractf(a, &mantA);
884         int expB = extractf(b, &mantB);
885         cl_uint signA = ua.u & 0x80000000U;
886         cl_uint signB = ub.u & 0x80000000U;
887 
888         // Force matching exponents if an operand is 0
889         if (a == 0.0f)
890         {
891             expA = expB;
892         }
893         else if (b == 0.0f)
894         {
895             expB = expA;
896         }
897 
898         addendA = (cl_ulong)mantA << 32;
899         addendB = (cl_ulong)mantB << 32;
900 
901         if (expA >= expB)
902         {
903             // Shift B relative to the A so that their exponents match
904             if (expA > expB) shift_right_sticky_64(&addendB, expA - expB);
905 
906             // add
907             if (signA ^ signB)
908                 sub64(&addendA, addendB, &signA, &expA);
909             else
910                 add64(&addendA, addendB, &expA);
911         }
912         else
913         {
914             // Shift the A relative to B so that their exponents match
915             shift_right_sticky_64(&addendA, expB - expA);
916 
917             // add
918             if (signA ^ signB)
919                 sub64(&addendB, addendA, &signB, &expB);
920             else
921                 add64(&addendB, addendA, &expB);
922 
923             addendA = addendB;
924             expA = expB;
925             signA = signB;
926         }
927 
928         // round to IEEE result
929         if (gIsInRTZMode)
930         {
931             ua.d = round_toward_zero_float_ftz(addendA, expA);
932         }
933         else
934         {
935             ua.d = round_to_nearest_even_float_ftz(addendA, expA);
936         }
937         // Set the sign
938         ua.u |= signA;
939         a = ua.d;
940     }
941     else
942     {
943         a += b;
944     }
945 #else
946     a += b;
947 #endif
948     return (double)a;
949 }
950 
951 
reference_subtract(double x,double y)952 double reference_subtract(double x, double y)
953 {
954     volatile float a = (float)x;
955     volatile float b = (float)y;
956 #if defined(__SSE__)                                                           \
957     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
958     // defeat x87
959     __m128 va = _mm_set_ss((float)a);
960     __m128 vb = _mm_set_ss((float)b);
961     va = _mm_sub_ss(va, vb);
962     _mm_store_ss((float *)&a, va);
963 #else
964     a -= b;
965 #endif
966     return a;
967 }
968 
reference_multiply(double x,double y)969 double reference_multiply(double x, double y)
970 {
971     volatile float a = (float)x;
972     volatile float b = (float)y;
973 #if defined(__SSE__)                                                           \
974     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
975     // defeat x87
976     __m128 va = _mm_set_ss((float)a);
977     __m128 vb = _mm_set_ss((float)b);
978     va = _mm_mul_ss(va, vb);
979     _mm_store_ss((float *)&a, va);
980 #elif defined(__PPC__)
981     // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
982     // denorm's to zero. As such, the reference multiply with FTZ must be
983     // emulated in sw.
984     if (fpu_control & _FPU_MASK_NI)
985     {
986         // extract exponent and mantissa
987         //   exponent is a standard unbiased signed integer
988         //   mantissa is a cl_uint, with leading non-zero bit positioned at the
989         //   MSB
990         union {
991             cl_uint u;
992             cl_float d;
993         } ua;
994         ua.d = a;
995         union {
996             cl_uint u;
997             cl_float d;
998         } ub;
999         ub.d = b;
1000         cl_uint mantA, mantB;
1001         int expA = extractf(a, &mantA);
1002         int expB = extractf(b, &mantB);
1003 
1004         // exact product of A and B
1005         int exponent = expA + expB;
1006         cl_uint sign = (ua.u ^ ub.u) & 0x80000000U;
1007         cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
1008 
1009         // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
1010         //  The MSB might not be set. If so, fix that. Otherwise, reflect the
1011         //  fact that we got another power of two from the multiplication
1012         if (0 == (0x8000000000000000ULL & product))
1013             product <<= 1;
1014         else
1015             exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then
1016                         // our exponent increased.
1017 
1018         // round to IEEE result -- we do not do flushing to zero here. That part
1019         // is handled manually in ternary.c.
1020         if (gIsInRTZMode)
1021         {
1022             ua.d = round_toward_zero_float_ftz(product, exponent);
1023         }
1024         else
1025         {
1026             ua.d = round_to_nearest_even_float_ftz(product, exponent);
1027         }
1028         // Set the sign
1029         ua.u |= sign;
1030         a = ua.d;
1031     }
1032     else
1033     {
1034         a *= b;
1035     }
1036 #else
1037     a *= b;
1038 #endif
1039     return a;
1040 }
1041 
reference_lgamma_r(double x,int * signp)1042 double reference_lgamma_r(double x, int *signp)
1043 {
1044     // This is not currently tested
1045     *signp = 0;
1046     return x;
1047 }
1048 
1049 
reference_isequal(double x,double y)1050 int reference_isequal(double x, double y) { return x == y; }
reference_isfinite(double x)1051 int reference_isfinite(double x) { return 0 != isfinite(x); }
reference_isgreater(double x,double y)1052 int reference_isgreater(double x, double y) { return x > y; }
reference_isgreaterequal(double x,double y)1053 int reference_isgreaterequal(double x, double y) { return x >= y; }
reference_isinf(double x)1054 int reference_isinf(double x) { return 0 != isinf(x); }
reference_isless(double x,double y)1055 int reference_isless(double x, double y) { return x < y; }
reference_islessequal(double x,double y)1056 int reference_islessequal(double x, double y) { return x <= y; }
reference_islessgreater(double x,double y)1057 int reference_islessgreater(double x, double y)
1058 {
1059     return 0 != islessgreater(x, y);
1060 }
reference_isnan(double x)1061 int reference_isnan(double x) { return 0 != isnan(x); }
reference_isnormal(double x)1062 int reference_isnormal(double x) { return 0 != isnormal((float)x); }
reference_isnotequal(double x,double y)1063 int reference_isnotequal(double x, double y) { return x != y; }
reference_isordered(double x,double y)1064 int reference_isordered(double x, double y) { return x == x && y == y; }
reference_isunordered(double x,double y)1065 int reference_isunordered(double x, double y) { return isnan(x) || isnan(y); }
reference_signbit(float x)1066 int reference_signbit(float x) { return 0 != signbit(x); }
1067 
1068 #if 1 // defined( _MSC_VER )
1069 
1070 // Missing functions for win32
1071 
1072 
reference_copysign(float x,float y)1073 float reference_copysign(float x, float y)
1074 {
1075     union {
1076         float f;
1077         cl_uint u;
1078     } ux, uy;
1079     ux.f = x;
1080     uy.f = y;
1081     ux.u &= 0x7fffffffU;
1082     ux.u |= uy.u & 0x80000000U;
1083     return ux.f;
1084 }
1085 
1086 
reference_copysignd(double x,double y)1087 double reference_copysignd(double x, double y)
1088 {
1089     union {
1090         double f;
1091         cl_ulong u;
1092     } ux, uy;
1093     ux.f = x;
1094     uy.f = y;
1095     ux.u &= 0x7fffffffffffffffULL;
1096     ux.u |= uy.u & 0x8000000000000000ULL;
1097     return ux.f;
1098 }
1099 
1100 
reference_round(double x)1101 double reference_round(double x)
1102 {
1103     double absx = reference_fabs(x);
1104     if (absx < 0.5) return reference_copysignd(0.0, x);
1105 
1106     if (absx < HEX_DBL(+, 1, 0, +, 53))
1107         x = reference_trunc(x + reference_copysignd(0.5, x));
1108 
1109     return x;
1110 }
1111 
reference_trunc(double x)1112 double reference_trunc(double x)
1113 {
1114     if (fabs(x) < HEX_DBL(+, 1, 0, +, 53))
1115     {
1116         cl_long l = (cl_long)x;
1117 
1118         return reference_copysignd((double)l, x);
1119     }
1120 
1121     return x;
1122 }
1123 
1124 #ifndef FP_ILOGB0
1125 #define FP_ILOGB0 INT_MIN
1126 #endif
1127 
1128 #ifndef FP_ILOGBNAN
1129 #define FP_ILOGBNAN INT_MAX
1130 #endif
1131 
1132 
reference_cbrt(double x)1133 double reference_cbrt(double x)
1134 {
1135     return reference_copysignd(reference_pow(reference_fabs(x), 1.0 / 3.0), x);
1136 }
1137 
reference_rint(double x)1138 double reference_rint(double x)
1139 {
1140     if (reference_fabs(x) < HEX_DBL(+, 1, 0, +, 52))
1141     {
1142         double magic = reference_copysignd(HEX_DBL(+, 1, 0, +, 52), x);
1143         double rounded = (x + magic) - magic;
1144         x = reference_copysignd(rounded, x);
1145     }
1146 
1147     return x;
1148 }
1149 
reference_acosh(double x)1150 double reference_acosh(double x)
1151 { // not full precision. Sufficient precision to cover float
1152     if (isnan(x)) return x + x;
1153 
1154     if (x < 1.0) return cl_make_nan();
1155 
1156     return reference_log(x + reference_sqrt(x + 1) * reference_sqrt(x - 1));
1157 }
1158 
reference_asinh(double x)1159 double reference_asinh(double x)
1160 {
1161     /*
1162      * ====================================================
1163      * This function is from fdlibm: http://www.netlib.org
1164      *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1165      *
1166      * Developed at SunSoft, a Sun Microsystems, Inc. business.
1167      * Permission to use, copy, modify, and distribute this
1168      * software is freely granted, provided that this notice
1169      * is preserved.
1170      * ====================================================
1171      */
1172     if (isnan(x) || isinf(x)) return x + x;
1173 
1174     double absx = reference_fabs(x);
1175     if (absx < HEX_DBL(+, 1, 0, -, 28)) return x;
1176 
1177     double sign = reference_copysignd(1.0, x);
1178 
1179     if (absx > HEX_DBL(+, 1, 0, +, 28))
1180         return sign
1181             * (reference_log(absx)
1182                + 0.693147180559945309417232121458176568); // log(2)
1183 
1184     if (absx > 2.0)
1185         return sign
1186             * reference_log(2.0 * absx
1187                             + 1.0 / (reference_sqrt(x * x + 1.0) + absx));
1188 
1189     return sign
1190         * reference_log1p(absx + x * x / (1.0 + reference_sqrt(1.0 + x * x)));
1191 }
1192 
1193 
reference_atanh(double x)1194 double reference_atanh(double x)
1195 {
1196     /*
1197      * ====================================================
1198      * This function is from fdlibm: http://www.netlib.org
1199      *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1200      *
1201      * Developed at SunSoft, a Sun Microsystems, Inc. business.
1202      * Permission to use, copy, modify, and distribute this
1203      * software is freely granted, provided that this notice
1204      * is preserved.
1205      * ====================================================
1206      */
1207     if (isnan(x)) return x + x;
1208 
1209     double signed_half = reference_copysignd(0.5, x);
1210     x = reference_fabs(x);
1211     if (x > 1.0) return cl_make_nan();
1212 
1213     if (x < 0.5)
1214         return signed_half * reference_log1p(2.0 * (x + x * x / (1 - x)));
1215 
1216     return signed_half * reference_log1p(2.0 * x / (1 - x));
1217 }
1218 
reference_relaxed_atan(double x)1219 double reference_relaxed_atan(double x) { return reference_atan(x); }
1220 
reference_relaxed_exp2(double x)1221 double reference_relaxed_exp2(double x) { return reference_exp2(x); }
1222 
reference_exp2(double x)1223 double reference_exp2(double x)
1224 { // Note: only suitable for verifying single precision. Doesn't have range of a
1225   // full double exp2 implementation.
1226     if (x == 0.0) return 1.0;
1227 
1228     // separate x into fractional and integer parts
1229     double i = reference_rint(x); // round to nearest integer
1230 
1231     if (i < -150) return 0.0;
1232 
1233     if (i > 129) return INFINITY;
1234 
1235     double f = x - i; // -0.5 <= f <= 0.5
1236 
1237     // find exp2(f)
1238     // calculate as p(f) = (exp2(f)-1)/f
1239     //              exp2(f) = f * p(f) + 1
1240     // p(f) is a minimax polynomial with error within 0x1.c1fd80f0d1ab7p-50
1241 
1242     double p = 0.693147180560184539289
1243         + (0.240226506955902863183
1244            + (0.055504108656833424373
1245               + (0.009618129212846484796
1246                  + (0.001333355902958566035
1247                     + (0.000154034191902497930
1248                        + (0.000015252317761038105
1249                           + (0.000001326283129417092
1250                              + 0.000000102593187638680 * f)
1251                               * f)
1252                            * f)
1253                         * f)
1254                      * f)
1255                   * f)
1256                * f)
1257             * f;
1258     f *= p;
1259     f += 1.0;
1260 
1261     // scale by 2 ** i
1262     union {
1263         cl_ulong u;
1264         double d;
1265     } u;
1266     int exponent = (int)i + 1023;
1267     u.u = (cl_ulong)exponent << 52;
1268 
1269     return f * u.d;
1270 }
1271 
1272 
reference_expm1(double x)1273 double reference_expm1(double x)
1274 { // Note: only suitable for verifying single precision. Doesn't have range of a
1275   // full double expm1 implementation. It is only accurate to 47 bits or less.
1276 
1277     // early out for small numbers and NaNs
1278     if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 24))) return x;
1279 
1280     // early out for large negative numbers
1281     if (x < -130.0) return -1.0;
1282 
1283     // early out for large positive numbers
1284     if (x > 100.0) return INFINITY;
1285 
1286     // separate x into fractional and integer parts
1287     double i = reference_rint(x); // round to nearest integer
1288     double f = x - i; // -0.5 <= f <= 0.5
1289 
1290     // reduce f to the range -0.0625 .. f.. 0.0625
1291     int index = (int)(f * 16.0) + 8; // 0...16
1292 
1293     static const double reduction[17] = { -0.5,  -0.4375, -0.375, -0.3125,
1294                                           -0.25, -0.1875, -0.125, -0.0625,
1295                                           0.0,   +0.0625, +0.125, +0.1875,
1296                                           +0.25, +0.3125, +0.375, +0.4375,
1297                                           +0.5 };
1298 
1299 
1300     // exponentials[i] = expm1(reduction[i])
1301     static const double exponentials[17] = {
1302         HEX_DBL(-, 1, 92e9a0720d3ec, -, 2),
1303         HEX_DBL(-, 1, 6adb1cd9205ee, -, 2),
1304         HEX_DBL(-, 1, 40373d42ce2e3, -, 2),
1305         HEX_DBL(-, 1, 12d35a41ba104, -, 2),
1306         HEX_DBL(-, 1, c5041854df7d4, -, 3),
1307         HEX_DBL(-, 1, 5e25fb4fde211, -, 3),
1308         HEX_DBL(-, 1, e14aed893eef4, -, 4),
1309         HEX_DBL(-, 1, f0540438fd5c3, -, 5),
1310         HEX_DBL(+, 0, 0, +, 0),
1311         HEX_DBL(+, 1, 082b577d34ed8, -, 4),
1312         HEX_DBL(+, 1, 10b022db7ae68, -, 3),
1313         HEX_DBL(+, 1, a65c0b85ac1a9, -, 3),
1314         HEX_DBL(+, 1, 22d78f0fa061a, -, 2),
1315         HEX_DBL(+, 1, 77a45d8117fd5, -, 2),
1316         HEX_DBL(+, 1, d1e944f6fbdaa, -, 2),
1317         HEX_DBL(+, 1, 190048ef6002, -, 1),
1318         HEX_DBL(+, 1, 4c2531c3c0d38, -, 1),
1319     };
1320 
1321 
1322     f -= reduction[index];
1323 
1324     // find expm1(f)
1325     // calculate as p(f) = (exp(f)-1)/f
1326     //              expm1(f) = f * p(f)
1327     // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over
1328     // the range +- 0.0625
1329     double p = 0.999999999999998001599
1330         + (0.499999999999839628284
1331            + (0.166666666672817459505
1332               + (0.041666666612283048687
1333                  + (0.008333330214567431435
1334                     + (0.001389005319303770070 + 0.000198833381525156667 * f)
1335                         * f)
1336                      * f)
1337                   * f)
1338                * f)
1339             * f;
1340     f *= p; // expm1( reduced f )
1341 
1342     // expm1(f) = (exmp1( reduced_f) + 1.0) * ( exponentials[index] + 1 ) - 1
1343     //          =  exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) +
1344     //          exponentials[index] + 1 -1 =  exmp1( reduced_f) *
1345     //          exponentials[index] + exmp1( reduced_f) + exponentials[index]
1346     f += exponentials[index] + f * exponentials[index];
1347 
1348     // scale by e ** i
1349     int exponent = (int)i;
1350     if (0 == exponent) return f; // precise answer for x near 1
1351 
1352     // table of e**(i-150)
1353     static const double exp_table[128 + 150 + 1] = {
1354         HEX_DBL(+, 1, 82e16284f5ec5, -, 217),
1355         HEX_DBL(+, 1, 06e9996332ba1, -, 215),
1356         HEX_DBL(+, 1, 6555cb289e44b, -, 214),
1357         HEX_DBL(+, 1, e5ab364643354, -, 213),
1358         HEX_DBL(+, 1, 4a0bd18e64df7, -, 211),
1359         HEX_DBL(+, 1, c094499cc578e, -, 210),
1360         HEX_DBL(+, 1, 30d759323998c, -, 208),
1361         HEX_DBL(+, 1, 9e5278ab1d4cf, -, 207),
1362         HEX_DBL(+, 1, 198fa3f30be25, -, 205),
1363         HEX_DBL(+, 1, 7eae636d6144e, -, 204),
1364         HEX_DBL(+, 1, 040f1036f4863, -, 202),
1365         HEX_DBL(+, 1, 6174e477a895f, -, 201),
1366         HEX_DBL(+, 1, e065b82dd95a, -, 200),
1367         HEX_DBL(+, 1, 4676be491d129, -, 198),
1368         HEX_DBL(+, 1, bbb5da5f7c823, -, 197),
1369         HEX_DBL(+, 1, 2d884eef5fdcb, -, 195),
1370         HEX_DBL(+, 1, 99d3397ab8371, -, 194),
1371         HEX_DBL(+, 1, 1681497ed15b3, -, 192),
1372         HEX_DBL(+, 1, 7a870f597fdbd, -, 191),
1373         HEX_DBL(+, 1, 013c74edba307, -, 189),
1374         HEX_DBL(+, 1, 5d9ec4ada7938, -, 188),
1375         HEX_DBL(+, 1, db2edfd20fa7c, -, 187),
1376         HEX_DBL(+, 1, 42eb9f39afb0b, -, 185),
1377         HEX_DBL(+, 1, b6e4f282b43f4, -, 184),
1378         HEX_DBL(+, 1, 2a42764857b19, -, 182),
1379         HEX_DBL(+, 1, 9560792d19314, -, 181),
1380         HEX_DBL(+, 1, 137b6ce8e052c, -, 179),
1381         HEX_DBL(+, 1, 766b45dd84f18, -, 178),
1382         HEX_DBL(+, 1, fce362fe6e7d, -, 177),
1383         HEX_DBL(+, 1, 59d34dd8a5473, -, 175),
1384         HEX_DBL(+, 1, d606847fc727a, -, 174),
1385         HEX_DBL(+, 1, 3f6a58b795de3, -, 172),
1386         HEX_DBL(+, 1, b2216c6efdac1, -, 171),
1387         HEX_DBL(+, 1, 2705b5b153fb8, -, 169),
1388         HEX_DBL(+, 1, 90fa1509bd50d, -, 168),
1389         HEX_DBL(+, 1, 107df698da211, -, 166),
1390         HEX_DBL(+, 1, 725ae6e7b9d35, -, 165),
1391         HEX_DBL(+, 1, f75d6040aeff6, -, 164),
1392         HEX_DBL(+, 1, 56126259e093c, -, 162),
1393         HEX_DBL(+, 1, d0ec7df4f7bd4, -, 161),
1394         HEX_DBL(+, 1, 3bf2cf6722e46, -, 159),
1395         HEX_DBL(+, 1, ad6b22f55db42, -, 158),
1396         HEX_DBL(+, 1, 23d1f3e5834a, -, 156),
1397         HEX_DBL(+, 1, 8c9feab89b876, -, 155),
1398         HEX_DBL(+, 1, 0d88cf37f00dd, -, 153),
1399         HEX_DBL(+, 1, 6e55d2bf838a7, -, 152),
1400         HEX_DBL(+, 1, f1e6b68529e33, -, 151),
1401         HEX_DBL(+, 1, 525be4e4e601d, -, 149),
1402         HEX_DBL(+, 1, cbe0a45f75eb1, -, 148),
1403         HEX_DBL(+, 1, 3884e838aea68, -, 146),
1404         HEX_DBL(+, 1, a8c1f14e2af5d, -, 145),
1405         HEX_DBL(+, 1, 20a717e64a9bd, -, 143),
1406         HEX_DBL(+, 1, 8851d84118908, -, 142),
1407         HEX_DBL(+, 1, 0a9bdfb02d24, -, 140),
1408         HEX_DBL(+, 1, 6a5bea046b42e, -, 139),
1409         HEX_DBL(+, 1, ec7f3b269efa8, -, 138),
1410         HEX_DBL(+, 1, 4eafb87eab0f2, -, 136),
1411         HEX_DBL(+, 1, c6e2d05bbc, -, 135),
1412         HEX_DBL(+, 1, 35208867c2683, -, 133),
1413         HEX_DBL(+, 1, a425b317eeacd, -, 132),
1414         HEX_DBL(+, 1, 1d8508fa8246a, -, 130),
1415         HEX_DBL(+, 1, 840fbc08fdc8a, -, 129),
1416         HEX_DBL(+, 1, 07b7112bc1ffe, -, 127),
1417         HEX_DBL(+, 1, 666d0dad2961d, -, 126),
1418         HEX_DBL(+, 1, e726c3f64d0fe, -, 125),
1419         HEX_DBL(+, 1, 4b0dc07cabf98, -, 123),
1420         HEX_DBL(+, 1, c1f2daf3b6a46, -, 122),
1421         HEX_DBL(+, 1, 31c5957a47de2, -, 120),
1422         HEX_DBL(+, 1, 9f96445648b9f, -, 119),
1423         HEX_DBL(+, 1, 1a6baeadb4fd1, -, 117),
1424         HEX_DBL(+, 1, 7fd974d372e45, -, 116),
1425         HEX_DBL(+, 1, 04da4d1452919, -, 114),
1426         HEX_DBL(+, 1, 62891f06b345, -, 113),
1427         HEX_DBL(+, 1, e1dd273aa8a4a, -, 112),
1428         HEX_DBL(+, 1, 4775e0840bfdd, -, 110),
1429         HEX_DBL(+, 1, bd109d9d94bda, -, 109),
1430         HEX_DBL(+, 1, 2e73f53fba844, -, 107),
1431         HEX_DBL(+, 1, 9b138170d6bfe, -, 106),
1432         HEX_DBL(+, 1, 175af0cf60ec5, -, 104),
1433         HEX_DBL(+, 1, 7baee1bffa80b, -, 103),
1434         HEX_DBL(+, 1, 02057d1245ceb, -, 101),
1435         HEX_DBL(+, 1, 5eafffb34ba31, -, 100),
1436         HEX_DBL(+, 1, dca23bae16424, -, 99),
1437         HEX_DBL(+, 1, 43e7fc88b8056, -, 97),
1438         HEX_DBL(+, 1, b83bf23a9a9eb, -, 96),
1439         HEX_DBL(+, 1, 2b2b8dd05b318, -, 94),
1440         HEX_DBL(+, 1, 969d47321e4cc, -, 93),
1441         HEX_DBL(+, 1, 1452b7723aed2, -, 91),
1442         HEX_DBL(+, 1, 778fe2497184c, -, 90),
1443         HEX_DBL(+, 1, fe7116182e9cc, -, 89),
1444         HEX_DBL(+, 1, 5ae191a99585a, -, 87),
1445         HEX_DBL(+, 1, d775d87da854d, -, 86),
1446         HEX_DBL(+, 1, 4063f8cc8bb98, -, 84),
1447         HEX_DBL(+, 1, b374b315f87c1, -, 83),
1448         HEX_DBL(+, 1, 27ec458c65e3c, -, 81),
1449         HEX_DBL(+, 1, 923372c67a074, -, 80),
1450         HEX_DBL(+, 1, 1152eaeb73c08, -, 78),
1451         HEX_DBL(+, 1, 737c5645114b5, -, 77),
1452         HEX_DBL(+, 1, f8e6c24b5592e, -, 76),
1453         HEX_DBL(+, 1, 571db733a9d61, -, 74),
1454         HEX_DBL(+, 1, d257d547e083f, -, 73),
1455         HEX_DBL(+, 1, 3ce9b9de78f85, -, 71),
1456         HEX_DBL(+, 1, aebabae3a41b5, -, 70),
1457         HEX_DBL(+, 1, 24b6031b49bda, -, 68),
1458         HEX_DBL(+, 1, 8dd5e1bb09d7e, -, 67),
1459         HEX_DBL(+, 1, 0e5b73d1ff53d, -, 65),
1460         HEX_DBL(+, 1, 6f741de1748ec, -, 64),
1461         HEX_DBL(+, 1, f36bd37f42f3e, -, 63),
1462         HEX_DBL(+, 1, 536452ee2f75c, -, 61),
1463         HEX_DBL(+, 1, cd480a1b7482, -, 60),
1464         HEX_DBL(+, 1, 39792499b1a24, -, 58),
1465         HEX_DBL(+, 1, aa0de4bf35b38, -, 57),
1466         HEX_DBL(+, 1, 2188ad6ae3303, -, 55),
1467         HEX_DBL(+, 1, 898471fca6055, -, 54),
1468         HEX_DBL(+, 1, 0b6c3afdde064, -, 52),
1469         HEX_DBL(+, 1, 6b7719a59f0e, -, 51),
1470         HEX_DBL(+, 1, ee001eed62aa, -, 50),
1471         HEX_DBL(+, 1, 4fb547c775da8, -, 48),
1472         HEX_DBL(+, 1, c8464f7616468, -, 47),
1473         HEX_DBL(+, 1, 36121e24d3bba, -, 45),
1474         HEX_DBL(+, 1, a56e0c2ac7f75, -, 44),
1475         HEX_DBL(+, 1, 1e642baeb84a, -, 42),
1476         HEX_DBL(+, 1, 853f01d6d53ba, -, 41),
1477         HEX_DBL(+, 1, 0885298767e9a, -, 39),
1478         HEX_DBL(+, 1, 67852a7007e42, -, 38),
1479         HEX_DBL(+, 1, e8a37a45fc32e, -, 37),
1480         HEX_DBL(+, 1, 4c1078fe9228a, -, 35),
1481         HEX_DBL(+, 1, c3527e433fab1, -, 34),
1482         HEX_DBL(+, 1, 32b48bf117da2, -, 32),
1483         HEX_DBL(+, 1, a0db0d0ddb3ec, -, 31),
1484         HEX_DBL(+, 1, 1b48655f37267, -, 29),
1485         HEX_DBL(+, 1, 81056ff2c5772, -, 28),
1486         HEX_DBL(+, 1, 05a628c699fa1, -, 26),
1487         HEX_DBL(+, 1, 639e3175a689d, -, 25),
1488         HEX_DBL(+, 1, e355bbaee85cb, -, 24),
1489         HEX_DBL(+, 1, 4875ca227ec38, -, 22),
1490         HEX_DBL(+, 1, be6c6fdb01612, -, 21),
1491         HEX_DBL(+, 1, 2f6053b981d98, -, 19),
1492         HEX_DBL(+, 1, 9c54c3b43bc8b, -, 18),
1493         HEX_DBL(+, 1, 18354238f6764, -, 16),
1494         HEX_DBL(+, 1, 7cd79b5647c9b, -, 15),
1495         HEX_DBL(+, 1, 02cf22526545a, -, 13),
1496         HEX_DBL(+, 1, 5fc21041027ad, -, 12),
1497         HEX_DBL(+, 1, de16b9c24a98f, -, 11),
1498         HEX_DBL(+, 1, 44e51f113d4d6, -, 9),
1499         HEX_DBL(+, 1, b993fe00d5376, -, 8),
1500         HEX_DBL(+, 1, 2c155b8213cf4, -, 6),
1501         HEX_DBL(+, 1, 97db0ccceb0af, -, 5),
1502         HEX_DBL(+, 1, 152aaa3bf81cc, -, 3),
1503         HEX_DBL(+, 1, 78b56362cef38, -, 2),
1504         HEX_DBL(+, 1, 0, +, 0),
1505         HEX_DBL(+, 1, 5bf0a8b145769, +, 1),
1506         HEX_DBL(+, 1, d8e64b8d4ddae, +, 2),
1507         HEX_DBL(+, 1, 415e5bf6fb106, +, 4),
1508         HEX_DBL(+, 1, b4c902e273a58, +, 5),
1509         HEX_DBL(+, 1, 28d389970338f, +, 7),
1510         HEX_DBL(+, 1, 936dc5690c08f, +, 8),
1511         HEX_DBL(+, 1, 122885aaeddaa, +, 10),
1512         HEX_DBL(+, 1, 749ea7d470c6e, +, 11),
1513         HEX_DBL(+, 1, fa7157c470f82, +, 12),
1514         HEX_DBL(+, 1, 5829dcf95056, +, 14),
1515         HEX_DBL(+, 1, d3c4488ee4f7f, +, 15),
1516         HEX_DBL(+, 1, 3de1654d37c9a, +, 17),
1517         HEX_DBL(+, 1, b00b5916ac955, +, 18),
1518         HEX_DBL(+, 1, 259ac48bf05d7, +, 20),
1519         HEX_DBL(+, 1, 8f0ccafad2a87, +, 21),
1520         HEX_DBL(+, 1, 0f2ebd0a8002, +, 23),
1521         HEX_DBL(+, 1, 709348c0ea4f9, +, 24),
1522         HEX_DBL(+, 1, f4f22091940bd, +, 25),
1523         HEX_DBL(+, 1, 546d8f9ed26e1, +, 27),
1524         HEX_DBL(+, 1, ceb088b68e804, +, 28),
1525         HEX_DBL(+, 1, 3a6e1fd9eecfd, +, 30),
1526         HEX_DBL(+, 1, ab5adb9c436, +, 31),
1527         HEX_DBL(+, 1, 226af33b1fdc1, +, 33),
1528         HEX_DBL(+, 1, 8ab7fb5475fb7, +, 34),
1529         HEX_DBL(+, 1, 0c3d3920962c9, +, 36),
1530         HEX_DBL(+, 1, 6c932696a6b5d, +, 37),
1531         HEX_DBL(+, 1, ef822f7f6731d, +, 38),
1532         HEX_DBL(+, 1, 50bba3796379a, +, 40),
1533         HEX_DBL(+, 1, c9aae4631c056, +, 41),
1534         HEX_DBL(+, 1, 370470aec28ed, +, 43),
1535         HEX_DBL(+, 1, a6b765d8cdf6d, +, 44),
1536         HEX_DBL(+, 1, 1f43fcc4b662c, +, 46),
1537         HEX_DBL(+, 1, 866f34a725782, +, 47),
1538         HEX_DBL(+, 1, 0953e2f3a1ef7, +, 49),
1539         HEX_DBL(+, 1, 689e221bc8d5b, +, 50),
1540         HEX_DBL(+, 1, ea215a1d20d76, +, 51),
1541         HEX_DBL(+, 1, 4d13fbb1a001a, +, 53),
1542         HEX_DBL(+, 1, c4b334617cc67, +, 54),
1543         HEX_DBL(+, 1, 33a43d282a519, +, 56),
1544         HEX_DBL(+, 1, a220d397972eb, +, 57),
1545         HEX_DBL(+, 1, 1c25c88df6862, +, 59),
1546         HEX_DBL(+, 1, 8232558201159, +, 60),
1547         HEX_DBL(+, 1, 0672a3c9eb871, +, 62),
1548         HEX_DBL(+, 1, 64b41c6d37832, +, 63),
1549         HEX_DBL(+, 1, e4cf766fe49be, +, 64),
1550         HEX_DBL(+, 1, 49767bc0483e3, +, 66),
1551         HEX_DBL(+, 1, bfc951eb8bb76, +, 67),
1552         HEX_DBL(+, 1, 304d6aeca254b, +, 69),
1553         HEX_DBL(+, 1, 9d97010884251, +, 70),
1554         HEX_DBL(+, 1, 19103e4080b45, +, 72),
1555         HEX_DBL(+, 1, 7e013cd114461, +, 73),
1556         HEX_DBL(+, 1, 03996528e074c, +, 75),
1557         HEX_DBL(+, 1, 60d4f6fdac731, +, 76),
1558         HEX_DBL(+, 1, df8c5af17ba3b, +, 77),
1559         HEX_DBL(+, 1, 45e3076d61699, +, 79),
1560         HEX_DBL(+, 1, baed16a6e0da7, +, 80),
1561         HEX_DBL(+, 1, 2cffdfebde1a1, +, 82),
1562         HEX_DBL(+, 1, 9919cabefcb69, +, 83),
1563         HEX_DBL(+, 1, 160345c9953e3, +, 85),
1564         HEX_DBL(+, 1, 79dbc9dc53c66, +, 86),
1565         HEX_DBL(+, 1, 00c810d464097, +, 88),
1566         HEX_DBL(+, 1, 5d009394c5c27, +, 89),
1567         HEX_DBL(+, 1, da57de8f107a8, +, 90),
1568         HEX_DBL(+, 1, 425982cf597cd, +, 92),
1569         HEX_DBL(+, 1, b61e5ca3a5e31, +, 93),
1570         HEX_DBL(+, 1, 29bb825dfcf87, +, 95),
1571         HEX_DBL(+, 1, 94a90db0d6fe2, +, 96),
1572         HEX_DBL(+, 1, 12fec759586fd, +, 98),
1573         HEX_DBL(+, 1, 75c1dc469e3af, +, 99),
1574         HEX_DBL(+, 1, fbfd219c43b04, +, 100),
1575         HEX_DBL(+, 1, 5936d44e1a146, +, 102),
1576         HEX_DBL(+, 1, d531d8a7ee79c, +, 103),
1577         HEX_DBL(+, 1, 3ed9d24a2d51b, +, 105),
1578         HEX_DBL(+, 1, b15cfe5b6e17b, +, 106),
1579         HEX_DBL(+, 1, 268038c2c0e, +, 108),
1580         HEX_DBL(+, 1, 9044a73545d48, +, 109),
1581         HEX_DBL(+, 1, 1002ab6218b38, +, 111),
1582         HEX_DBL(+, 1, 71b3540cbf921, +, 112),
1583         HEX_DBL(+, 1, f6799ea9c414a, +, 113),
1584         HEX_DBL(+, 1, 55779b984f3eb, +, 115),
1585         HEX_DBL(+, 1, d01a210c44aa4, +, 116),
1586         HEX_DBL(+, 1, 3b63da8e9121, +, 118),
1587         HEX_DBL(+, 1, aca8d6b0116b8, +, 119),
1588         HEX_DBL(+, 1, 234de9e0c74e9, +, 121),
1589         HEX_DBL(+, 1, 8bec7503ca477, +, 122),
1590         HEX_DBL(+, 1, 0d0eda9796b9, +, 124),
1591         HEX_DBL(+, 1, 6db0118477245, +, 125),
1592         HEX_DBL(+, 1, f1056dc7bf22d, +, 126),
1593         HEX_DBL(+, 1, 51c2cc3433801, +, 128),
1594         HEX_DBL(+, 1, cb108ffbec164, +, 129),
1595         HEX_DBL(+, 1, 37f780991b584, +, 131),
1596         HEX_DBL(+, 1, a801c0ea8ac4d, +, 132),
1597         HEX_DBL(+, 1, 20247cc4c46c1, +, 134),
1598         HEX_DBL(+, 1, 87a0553328015, +, 135),
1599         HEX_DBL(+, 1, 0a233dee4f9bb, +, 137),
1600         HEX_DBL(+, 1, 69b7f55b808ba, +, 138),
1601         HEX_DBL(+, 1, eba064644060a, +, 139),
1602         HEX_DBL(+, 1, 4e184933d9364, +, 141),
1603         HEX_DBL(+, 1, c614fe2531841, +, 142),
1604         HEX_DBL(+, 1, 3494a9b171bf5, +, 144),
1605         HEX_DBL(+, 1, a36798b9d969b, +, 145),
1606         HEX_DBL(+, 1, 1d03d8c0c04af, +, 147),
1607         HEX_DBL(+, 1, 836026385c974, +, 148),
1608         HEX_DBL(+, 1, 073fbe9ac901d, +, 150),
1609         HEX_DBL(+, 1, 65cae0969f286, +, 151),
1610         HEX_DBL(+, 1, e64a58639cae8, +, 152),
1611         HEX_DBL(+, 1, 4a77f5f9b50f9, +, 154),
1612         HEX_DBL(+, 1, c12744a3a28e3, +, 155),
1613         HEX_DBL(+, 1, 313b3b6978e85, +, 157),
1614         HEX_DBL(+, 1, 9eda3a31e587e, +, 158),
1615         HEX_DBL(+, 1, 19ebe56b56453, +, 160),
1616         HEX_DBL(+, 1, 7f2bc6e599b7e, +, 161),
1617         HEX_DBL(+, 1, 04644610df2ff, +, 163),
1618         HEX_DBL(+, 1, 61e8b490ac4e6, +, 164),
1619         HEX_DBL(+, 1, e103201f299b3, +, 165),
1620         HEX_DBL(+, 1, 46e1b637beaf5, +, 167),
1621         HEX_DBL(+, 1, bc473cfede104, +, 168),
1622         HEX_DBL(+, 1, 2deb1b9c85e2d, +, 170),
1623         HEX_DBL(+, 1, 9a5981ca67d1, +, 171),
1624         HEX_DBL(+, 1, 16dc8a9ef670b, +, 173),
1625         HEX_DBL(+, 1, 7b03166942309, +, 174),
1626         HEX_DBL(+, 1, 0190be03150a7, +, 176),
1627         HEX_DBL(+, 1, 5e1152f9a8119, +, 177),
1628         HEX_DBL(+, 1, dbca9263f8487, +, 178),
1629         HEX_DBL(+, 1, 43556dee93bee, +, 180),
1630         HEX_DBL(+, 1, b774c12967dfa, +, 181),
1631         HEX_DBL(+, 1, 2aa4306e922c2, +, 183),
1632         HEX_DBL(+, 1, 95e54c5dd4217, +, 184)
1633     };
1634 
1635     // scale by e**i --  (expm1(f) + 1)*e**i - 1  = expm1(f) * e**i + e**i - 1 =
1636     // e**i
1637     return exp_table[exponent + 150] + (f * exp_table[exponent + 150] - 1.0);
1638 }
1639 
1640 
reference_fmax(double x,double y)1641 double reference_fmax(double x, double y)
1642 {
1643     if (isnan(y)) return x;
1644 
1645     return x >= y ? x : y;
1646 }
1647 
reference_fmin(double x,double y)1648 double reference_fmin(double x, double y)
1649 {
1650     if (isnan(y)) return x;
1651 
1652     return x <= y ? x : y;
1653 }
1654 
reference_hypot(double x,double y)1655 double reference_hypot(double x, double y)
1656 {
1657     // Since the inputs are actually floats, we don't have to worry about range
1658     // here
1659     if (isinf(x) || isinf(y)) return INFINITY;
1660 
1661     return sqrt(x * x + y * y);
1662 }
1663 
reference_ilogbl(long double x)1664 int reference_ilogbl(long double x)
1665 {
1666     extern int gDeviceILogb0, gDeviceILogbNaN;
1667 
1668     // Since we are just using this to verify double precision, we can
1669     // use the double precision ilogb here
1670     union {
1671         double f;
1672         cl_ulong u;
1673     } u;
1674     u.f = (double)x;
1675 
1676     int exponent = (int)(u.u >> 52) & 0x7ff;
1677     if (exponent == 0x7ff)
1678     {
1679         if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
1680 
1681         return CL_INT_MAX;
1682     }
1683 
1684     if (exponent == 0)
1685     { // deal with denormals
1686         u.f = x * HEX_DBL(+, 1, 0, +, 64);
1687         exponent = (cl_uint)(u.u >> 52) & 0x7ff;
1688         if (exponent == 0) return gDeviceILogb0;
1689 
1690         exponent -= 1023 + 64;
1691         return exponent;
1692     }
1693 
1694     return exponent - 1023;
1695 }
1696 
reference_relaxed_log2(double x)1697 double reference_relaxed_log2(double x) { return reference_log2(x); }
1698 
reference_log2(double x)1699 double reference_log2(double x)
1700 {
1701     if (isnan(x) || x < 0.0 || x == -INFINITY) return cl_make_nan();
1702 
1703     if (x == 0.0f) return -INFINITY;
1704 
1705     if (x == INFINITY) return INFINITY;
1706 
1707     double hi, lo;
1708     __log2_ep(&hi, &lo, x);
1709     return hi;
1710 }
1711 
reference_log1p(double x)1712 double reference_log1p(double x)
1713 { // This function is suitable only for verifying log1pf(). It produces several
1714   // double precision ulps of error.
1715 
1716     // Handle small and NaN
1717     if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 53))) return x;
1718 
1719     // deal with special values
1720     if (x <= -1.0)
1721     {
1722         if (x < -1.0) return cl_make_nan();
1723         return -INFINITY;
1724     }
1725 
1726     // infinity
1727     if (x == INFINITY) return INFINITY;
1728 
1729     // High precision result for when near 0, to avoid problems with the
1730     // reference result falling in the wrong binade.
1731     if (reference_fabs(x) < HEX_DBL(+, 1, 0, -, 28)) return (1.0 - 0.5 * x) * x;
1732 
1733     // Our polynomial is only good in the region +-2**-4.
1734     // If we aren't in that range then we need to reduce to be in that range
1735     double correctionLo =
1736         -0.0; // correction down stream to compensate for the reduction, if any
1737     double correctionHi =
1738         -0.0; // correction down stream to compensate for the exponent, if any
1739     if (reference_fabs(x) > HEX_DBL(+, 1, 0, -, 4))
1740     {
1741         x += 1.0; // double should cover any loss of precision here
1742 
1743         // separate x into (1+f) * 2**i
1744         union {
1745             double d;
1746             cl_ulong u;
1747         } u;
1748         u.d = x;
1749         int i = (int)((u.u >> 52) & 0x7ff) - 1023;
1750         u.u &= 0x000fffffffffffffULL;
1751         int index = (int)(u.u >> 48);
1752         u.u |= 0x3ff0000000000000ULL;
1753         double f = u.d;
1754 
1755         // further reduce f to be within 1/16 of 1.0
1756         static const double scale_table[16] = {
1757             1.0,
1758             HEX_DBL(+, 1, d2d2d2d6e3f79, -, 1),
1759             HEX_DBL(+, 1, b8e38e42737a1, -, 1),
1760             HEX_DBL(+, 1, a1af28711adf3, -, 1),
1761             HEX_DBL(+, 1, 8cccccd88dd65, -, 1),
1762             HEX_DBL(+, 1, 79e79e810ec8f, -, 1),
1763             HEX_DBL(+, 1, 68ba2e94df404, -, 1),
1764             HEX_DBL(+, 1, 590b216defb29, -, 1),
1765             HEX_DBL(+, 1, 4aaaaab1500ed, -, 1),
1766             HEX_DBL(+, 1, 3d70a3e0d6f73, -, 1),
1767             HEX_DBL(+, 1, 313b13bb39f4f, -, 1),
1768             HEX_DBL(+, 1, 25ed09823f1cc, -, 1),
1769             HEX_DBL(+, 1, 1b6db6e77457b, -, 1),
1770             HEX_DBL(+, 1, 11a7b96a3a34f, -, 1),
1771             HEX_DBL(+, 1, 0888888e46fea, -, 1),
1772             HEX_DBL(+, 1, 00000038e9862, -, 1)
1773         };
1774 
1775         // correction_table[i] = -log( scale_table[i] )
1776         // All entries have >= 64 bits of precision (rather than the expected
1777         // 53)
1778         static const double correction_table[16] = {
1779             -0.0,
1780             HEX_DBL(+, 1, 7a5c722c16058, -, 4),
1781             HEX_DBL(+, 1, 323db16c89ab1, -, 3),
1782             HEX_DBL(+, 1, a0f87d180629, -, 3),
1783             HEX_DBL(+, 1, 050279324e17c, -, 2),
1784             HEX_DBL(+, 1, 36f885bb270b0, -, 2),
1785             HEX_DBL(+, 1, 669b771b5cc69, -, 2),
1786             HEX_DBL(+, 1, 94203a6292a05, -, 2),
1787             HEX_DBL(+, 1, bfb4f9cb333a4, -, 2),
1788             HEX_DBL(+, 1, e982376ddb80e, -, 2),
1789             HEX_DBL(+, 1, 08d5d8769b2b2, -, 1),
1790             HEX_DBL(+, 1, 1c288bc00e0cf, -, 1),
1791             HEX_DBL(+, 1, 2ec7535b31ecb, -, 1),
1792             HEX_DBL(+, 1, 40bed0adc63fb, -, 1),
1793             HEX_DBL(+, 1, 521a5c0330615, -, 1),
1794             HEX_DBL(+, 1, 62e42f7dd092c, -, 1)
1795         };
1796 
1797         f *= scale_table[index];
1798         correctionLo = correction_table[index];
1799 
1800         // log( 2**(i) ) = i * log(2)
1801         correctionHi = (double)i * 0.693147180559945309417232121458176568;
1802 
1803         x = f - 1.0;
1804     }
1805 
1806 
1807     // minmax polynomial for p(x) = (log(x+1) - x)/x valid over the range x =
1808     // [-1/16, 1/16]
1809     //          max error HEX_DBL( +, 1, 048f61f9a5eca, -, 52 )
1810     double p = HEX_DBL(-, 1, cc33de97a9d7b, -, 46)
1811         + (HEX_DBL(-, 1, fffffffff3eb7, -, 2)
1812            + (HEX_DBL(+, 1, 5555555633ef7, -, 2)
1813               + (HEX_DBL(-, 1, 00000062c78, -, 2)
1814                  + (HEX_DBL(+, 1, 9999958a3321, -, 3)
1815                     + (HEX_DBL(-, 1, 55534ce65c347, -, 3)
1816                        + (HEX_DBL(+, 1, 24957208391a5, -, 3)
1817                           + (HEX_DBL(-, 1, 02287b9a5b4a1, -, 3)
1818                              + HEX_DBL(+, 1, c757d922180ed, -, 4) * x)
1819                               * x)
1820                            * x)
1821                         * x)
1822                      * x)
1823                   * x)
1824                * x)
1825             * x;
1826 
1827     // log(x+1) = x * p(x) + x
1828     x += x * p;
1829 
1830     return correctionHi + (correctionLo + x);
1831 }
1832 
reference_logb(double x)1833 double reference_logb(double x)
1834 {
1835     union {
1836         float f;
1837         cl_uint u;
1838     } u;
1839     u.f = (float)x;
1840 
1841     cl_int exponent = (u.u >> 23) & 0xff;
1842     if (exponent == 0xff) return x * x;
1843 
1844     if (exponent == 0)
1845     { // deal with denormals
1846         u.u = (u.u & 0x007fffff) | 0x3f800000;
1847         u.f -= 1.0f;
1848         exponent = (u.u >> 23) & 0xff;
1849         if (exponent == 0) return -INFINITY;
1850 
1851         return exponent - (127 + 126);
1852     }
1853 
1854     return exponent - 127;
1855 }
1856 
reference_relaxed_reciprocal(double x)1857 double reference_relaxed_reciprocal(double x) { return 1.0f / ((float)x); }
1858 
reference_reciprocal(double x)1859 double reference_reciprocal(double x) { return 1.0 / x; }
1860 
reference_remainder(double x,double y)1861 double reference_remainder(double x, double y)
1862 {
1863     int i;
1864     return reference_remquo(x, y, &i);
1865 }
1866 
reference_lgamma(double x)1867 double reference_lgamma(double x)
1868 {
1869     /*
1870      * ====================================================
1871      * This function is from fdlibm. http://www.netlib.org
1872      * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1873      *
1874      * Developed at SunSoft, a Sun Microsystems, Inc. business.
1875      * Permission to use, copy, modify, and distribute this
1876      * software is freely granted, provided that this notice
1877      * is preserved.
1878      * ====================================================
1879      *
1880      */
1881 
1882     static const double // two52 = 4.50359962737049600000e+15, /* 0x43300000,
1883                         // 0x00000000 */
1884         half = 5.00000000000000000000e-01, /* 0x3FE00000,
1885                                               0x00000000 */
1886         one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
1887         pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
1888         a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
1889         a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
1890         a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
1891         a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
1892         a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
1893         a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
1894         a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
1895         a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
1896         a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
1897         a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
1898         a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
1899         a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
1900         tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
1901         tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
1902         /* tt = -(tail of tf) */
1903         tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
1904         t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
1905         t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
1906         t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
1907         t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
1908         t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
1909         t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
1910         t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
1911         t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
1912         t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
1913         t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
1914         t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
1915         t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
1916         t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
1917         t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
1918         t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
1919         u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
1920         u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
1921         u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
1922         u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
1923         u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
1924         u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
1925         v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
1926         v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
1927         v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
1928         v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
1929         v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
1930         s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
1931         s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
1932         s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
1933         s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
1934         s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
1935         s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
1936         s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
1937         r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
1938         r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
1939         r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
1940         r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
1941         r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
1942         r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
1943         w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
1944         w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
1945         w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
1946         w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
1947         w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
1948         w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
1949         w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
1950 
1951     static const double zero = 0.00000000000000000000e+00;
1952     double t, y, z, nadj, p, p1, p2, p3, q, r, w;
1953     cl_int i, hx, lx, ix;
1954 
1955     union {
1956         double d;
1957         cl_ulong u;
1958     } u;
1959     u.d = x;
1960 
1961     hx = (cl_int)(u.u >> 32);
1962     lx = (cl_int)(u.u & 0xffffffffULL);
1963 
1964     /* purge off +-inf, NaN, +-0, and negative arguments */
1965     //    *signgamp = 1;
1966     ix = hx & 0x7fffffff;
1967     if (ix >= 0x7ff00000) return x * x;
1968     if ((ix | lx) == 0) return INFINITY;
1969     if (ix < 0x3b900000)
1970     { /* |x|<2**-70, return -log(|x|) */
1971         if (hx < 0)
1972         {
1973             //            *signgamp = -1;
1974             return -reference_log(-x);
1975         }
1976         else
1977             return -reference_log(x);
1978     }
1979     if (hx < 0)
1980     {
1981         if (ix >= 0x43300000) /* |x|>=2**52, must be -integer */
1982             return INFINITY;
1983         t = reference_sinpi(x);
1984         if (t == zero) return INFINITY; /* -integer */
1985         nadj = reference_log(pi / reference_fabs(t * x));
1986         //        if(t<zero) *signgamp = -1;
1987         x = -x;
1988     }
1989 
1990     /* purge off 1 and 2 */
1991     if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0))
1992         r = 0;
1993     /* for x < 2.0 */
1994     else if (ix < 0x40000000)
1995     {
1996         if (ix <= 0x3feccccc)
1997         { /* lgamma(x) = lgamma(x+1)-log(x) */
1998             r = -reference_log(x);
1999             if (ix >= 0x3FE76944)
2000             {
2001                 y = 1.0 - x;
2002                 i = 0;
2003             }
2004             else if (ix >= 0x3FCDA661)
2005             {
2006                 y = x - (tc - one);
2007                 i = 1;
2008             }
2009             else
2010             {
2011                 y = x;
2012                 i = 2;
2013             }
2014         }
2015         else
2016         {
2017             r = zero;
2018             if (ix >= 0x3FFBB4C3)
2019             {
2020                 y = 2.0 - x;
2021                 i = 0;
2022             } /* [1.7316,2] */
2023             else if (ix >= 0x3FF3B4C4)
2024             {
2025                 y = x - tc;
2026                 i = 1;
2027             } /* [1.23,1.73] */
2028             else
2029             {
2030                 y = x - one;
2031                 i = 2;
2032             }
2033         }
2034         switch (i)
2035         {
2036             case 0:
2037                 z = y * y;
2038                 p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
2039                 p2 = z
2040                     * (a1
2041                        + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
2042                 p = y * p1 + p2;
2043                 r += (p - 0.5 * y);
2044                 break;
2045             case 1:
2046                 z = y * y;
2047                 w = z * y;
2048                 p1 = t0
2049                     + w
2050                         * (t3
2051                            + w * (t6 + w * (t9 + w * t12))); /* parallel comp */
2052                 p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
2053                 p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
2054                 p = z * p1 - (tt - w * (p2 + y * p3));
2055                 r += (tf + p);
2056                 break;
2057             case 2:
2058                 p1 = y
2059                     * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
2060                 p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
2061                 r += (-0.5 * y + p1 / p2);
2062         }
2063     }
2064     else if (ix < 0x40200000)
2065     { /* x < 8.0 */
2066         i = (int)x;
2067         t = zero;
2068         y = x - (double)i;
2069         p = y
2070             * (s0
2071                + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
2072         q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
2073         r = half * y + p / q;
2074         z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
2075         switch (i)
2076         {
2077             case 7: z *= (y + 6.0); /* FALLTHRU */
2078             case 6: z *= (y + 5.0); /* FALLTHRU */
2079             case 5: z *= (y + 4.0); /* FALLTHRU */
2080             case 4: z *= (y + 3.0); /* FALLTHRU */
2081             case 3:
2082                 z *= (y + 2.0); /* FALLTHRU */
2083                 r += reference_log(z);
2084                 break;
2085         }
2086         /* 8.0 <= x < 2**58 */
2087     }
2088     else if (ix < 0x43900000)
2089     {
2090         t = reference_log(x);
2091         z = one / x;
2092         y = z * z;
2093         w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
2094         r = (x - half) * (t - one) + w;
2095     }
2096     else
2097         /* 2**58 <= x <= inf */
2098         r = x * (reference_log(x) - one);
2099     if (hx < 0) r = nadj - r;
2100     return r;
2101 }
2102 
2103 #endif // _MSC_VER
2104 
reference_assignment(double x)2105 double reference_assignment(double x) { return x; }
2106 
reference_not(double x)2107 int reference_not(double x)
2108 {
2109     int r = !x;
2110     return r;
2111 }
2112 
2113 #pragma mark -
2114 #pragma mark Double testing
2115 
2116 #ifndef M_PIL
2117 #define M_PIL                                                                  \
2118     3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L
2119 #endif
2120 
2121 static long double reduce1l(long double x);
2122 
2123 #ifdef __PPC__
2124 // Since long double on PPC is really extended precision double arithmetic
2125 // consisting of two doubles (a high and low). This form of long double has
2126 // the potential of representing a number with more than LDBL_MANT_DIG digits
2127 // such that reduction algorithm used for other architectures will not work.
2128 // Instead and alternate reduction method is used.
2129 
reduce1l(long double x)2130 static long double reduce1l(long double x)
2131 {
2132     union {
2133         long double ld;
2134         double d[2];
2135     } u;
2136 
2137     // Reduce the high and low halfs separately.
2138     u.ld = x;
2139     return ((long double)reduce1(u.d[0]) + reduce1(u.d[1]));
2140 }
2141 
2142 #else // !__PPC__
2143 
reduce1l(long double x)2144 static long double reduce1l(long double x)
2145 {
2146     static long double unit_exp = 0;
2147     if (0.0L == unit_exp) unit_exp = scalbnl(1.0L, LDBL_MANT_DIG);
2148 
2149     if (reference_fabsl(x) >= unit_exp)
2150     {
2151         if (reference_fabsl(x) == INFINITY) return cl_make_nan();
2152 
2153         return 0.0L; // we patch up the sign for sinPi and cosPi later, since
2154                      // they need different signs
2155     }
2156 
2157     // Find the nearest multiple of 2
2158     const long double r = reference_copysignl(unit_exp, x);
2159     long double z = x + r;
2160     z -= r;
2161 
2162     // subtract it from x. Value is now in the range -1 <= x <= 1
2163     return x - z;
2164 }
2165 #endif // __PPC__
2166 
reference_acospil(long double x)2167 long double reference_acospil(long double x)
2168 {
2169     return reference_acosl(x) / M_PIL;
2170 }
reference_asinpil(long double x)2171 long double reference_asinpil(long double x)
2172 {
2173     return reference_asinl(x) / M_PIL;
2174 }
reference_atanpil(long double x)2175 long double reference_atanpil(long double x)
2176 {
2177     return reference_atanl(x) / M_PIL;
2178 }
reference_atan2pil(long double y,long double x)2179 long double reference_atan2pil(long double y, long double x)
2180 {
2181     return reference_atan2l(y, x) / M_PIL;
2182 }
reference_cospil(long double x)2183 long double reference_cospil(long double x)
2184 {
2185     if (reference_fabsl(x) >= HEX_LDBL(+, 1, 0, +, 54))
2186     {
2187         if (reference_fabsl(x) == INFINITY) return cl_make_nan();
2188 
2189         // Note this probably fails for odd values between 0x1.0p52 and
2190         // 0x1.0p53. However, when starting with single precision inputs, there
2191         // will be no odd values.
2192 
2193         return 1.0L;
2194     }
2195 
2196     x = reduce1l(x);
2197 
2198 #if DBL_MANT_DIG >= LDBL_MANT_DIG
2199 
2200     // phase adjust
2201     double xhi = 0.0;
2202     double xlo = 0.0;
2203     xhi = (double)x + 0.5;
2204 
2205     if (reference_fabsl(x) > 0.5L)
2206     {
2207         xlo = xhi - x;
2208         xlo = 0.5 - xlo;
2209     }
2210     else
2211     {
2212         xlo = xhi - 0.5;
2213         xlo = x - xlo;
2214     }
2215 
2216     // reduce to [-0.5, 0.5]
2217     if (xhi < -0.5)
2218     {
2219         xhi = -1.0 - xhi;
2220         xlo = -xlo;
2221     }
2222     else if (xhi > 0.5)
2223     {
2224         xhi = 1.0 - xhi;
2225         xlo = -xlo;
2226     }
2227 
2228     // cosPi zeros are all +0
2229     if (xhi == 0.0 && xlo == 0.0) return 0.0;
2230 
2231     xhi *= M_PI;
2232     xlo *= M_PI;
2233 
2234     xhi += xlo;
2235 
2236     return reference_sinl(xhi);
2237 
2238 #else
2239     // phase adjust
2240     x += 0.5L;
2241 
2242     // reduce to [-0.5, 0.5]
2243     if (x < -0.5L)
2244         x = -1.0L - x;
2245     else if (x > 0.5L)
2246         x = 1.0L - x;
2247 
2248     // cosPi zeros are all +0
2249     if (x == 0.0L) return 0.0L;
2250 
2251     return reference_sinl(x * M_PIL);
2252 #endif
2253 }
2254 
reference_dividel(long double x,long double y)2255 long double reference_dividel(long double x, long double y)
2256 {
2257     double dx = x;
2258     double dy = y;
2259     return dx / dy;
2260 }
2261 
2262 typedef struct
2263 {
2264     double hi, lo;
2265 } double_double;
2266 
2267 // Split doubles_double into a series of consecutive 26-bit precise doubles and
2268 // a remainder. Note for later -- for multiplication, it might be better to
2269 // split each double into a power of two and two 26 bit portions
2270 //                      multiplication of a double double by a known power of
2271 //                      two is cheap. The current approach causes some inexact
2272 //                      arithmetic in mul_dd.
split_dd(double_double x,double_double * hi,double_double * lo)2273 static inline void split_dd(double_double x, double_double *hi,
2274                             double_double *lo)
2275 {
2276     union {
2277         double d;
2278         cl_ulong u;
2279     } u;
2280     u.d = x.hi;
2281     u.u &= 0xFFFFFFFFF8000000ULL;
2282     hi->hi = u.d;
2283     x.hi -= u.d;
2284 
2285     u.d = x.hi;
2286     u.u &= 0xFFFFFFFFF8000000ULL;
2287     hi->lo = u.d;
2288     x.hi -= u.d;
2289 
2290     double temp = x.hi;
2291     x.hi += x.lo;
2292     x.lo -= x.hi - temp;
2293     u.d = x.hi;
2294     u.u &= 0xFFFFFFFFF8000000ULL;
2295     lo->hi = u.d;
2296     x.hi -= u.d;
2297 
2298     lo->lo = x.hi + x.lo;
2299 }
2300 
accum_d(double_double a,double b)2301 static inline double_double accum_d(double_double a, double b)
2302 {
2303     double temp;
2304     if (fabs(b) > fabs(a.hi))
2305     {
2306         temp = a.hi;
2307         a.hi += b;
2308         a.lo += temp - (a.hi - b);
2309     }
2310     else
2311     {
2312         temp = a.hi;
2313         a.hi += b;
2314         a.lo += b - (a.hi - temp);
2315     }
2316 
2317     if (isnan(a.lo)) a.lo = 0.0;
2318 
2319     return a;
2320 }
2321 
add_dd(double_double a,double_double b)2322 static inline double_double add_dd(double_double a, double_double b)
2323 {
2324     double_double r = { -0.0 - 0.0 };
2325 
2326     if (isinf(a.hi) || isinf(b.hi) || isnan(a.hi) || isnan(b.hi) || 0.0 == a.hi
2327         || 0.0 == b.hi)
2328     {
2329         r.hi = a.hi + b.hi;
2330         r.lo = a.lo + b.lo;
2331         if (isnan(r.lo)) r.lo = 0.0;
2332         return r;
2333     }
2334 
2335     // merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|,
2336     // |b.hi| > |b.lo|, so we don't have to do the first merge pass
2337     double terms[4] = { a.hi, b.hi, a.lo, b.lo };
2338     double temp;
2339 
2340     // Sort hi terms
2341     if (fabs(terms[0]) < fabs(terms[1]))
2342     {
2343         temp = terms[0];
2344         terms[0] = terms[1];
2345         terms[1] = temp;
2346     }
2347     // sort lo terms
2348     if (fabs(terms[2]) < fabs(terms[3]))
2349     {
2350         temp = terms[2];
2351         terms[2] = terms[3];
2352         terms[3] = temp;
2353     }
2354     // Fix case where small high term is less than large low term
2355     if (fabs(terms[1]) < fabs(terms[2]))
2356     {
2357         temp = terms[1];
2358         terms[1] = terms[2];
2359         terms[2] = temp;
2360     }
2361 
2362     // accumulate the results
2363     r.hi = terms[2] + terms[3];
2364     r.lo = terms[3] - (r.hi - terms[2]);
2365 
2366     temp = r.hi;
2367     r.hi += terms[1];
2368     r.lo += temp - (r.hi - terms[1]);
2369 
2370     temp = r.hi;
2371     r.hi += terms[0];
2372     r.lo += temp - (r.hi - terms[0]);
2373 
2374     // canonicalize the result
2375     temp = r.hi;
2376     r.hi += r.lo;
2377     r.lo = r.lo - (r.hi - temp);
2378     if (isnan(r.lo)) r.lo = 0.0;
2379 
2380     return r;
2381 }
2382 
mul_dd(double_double a,double_double b)2383 static inline double_double mul_dd(double_double a, double_double b)
2384 {
2385     double_double result = { -0.0, -0.0 };
2386 
2387     // Inf, nan and 0
2388     if (isnan(a.hi) || isnan(b.hi) || isinf(a.hi) || isinf(b.hi) || 0.0 == a.hi
2389         || 0.0 == b.hi)
2390     {
2391         result.hi = a.hi * b.hi;
2392         return result;
2393     }
2394 
2395     double_double ah, al, bh, bl;
2396     split_dd(a, &ah, &al);
2397     split_dd(b, &bh, &bl);
2398 
2399     double p0 = ah.hi * bh.hi; // exact    (52 bits in product) 0
2400     double p1 = ah.hi * bh.lo; // exact    (52 bits in product) 26
2401     double p2 = ah.lo * bh.hi; // exact    (52 bits in product) 26
2402     double p3 = ah.lo * bh.lo; // exact    (52 bits in product) 52
2403     double p4 = al.hi * bh.hi; // exact    (52 bits in product) 52
2404     double p5 = al.hi * bh.lo; // exact    (52 bits in product) 78
2405     double p6 = al.lo * bh.hi; // inexact  (54 bits in product) 78
2406     double p7 = al.lo * bh.lo; // inexact  (54 bits in product) 104
2407     double p8 = ah.hi * bl.hi; // exact    (52 bits in product) 52
2408     double p9 = ah.hi * bl.lo; // inexact  (54 bits in product) 78
2409     double pA = ah.lo * bl.hi; // exact    (52 bits in product) 78
2410     double pB = ah.lo * bl.lo; // inexact  (54 bits in product) 104
2411     double pC = al.hi * bl.hi; // exact    (52 bits in product) 104
2412     // the last 3 terms are two low to appear in the result
2413 
2414 
2415     // take advantage of the known relative magnitudes of the partial products
2416     // to avoid some sorting Combine 2**-78 and 2**-104 terms. Here we are a bit
2417     // sloppy about canonicalizing the double_doubles
2418     double_double t0 = { pA, pC };
2419     double_double t1 = { p9, pB };
2420     double_double t2 = { p6, p7 };
2421     double temp0, temp1, temp2;
2422 
2423     t0 = accum_d(t0, p5); // there is an extra 2**-78 term to deal with
2424 
2425     // Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the
2426     // double_doubles
2427     temp0 = t0.hi;
2428     temp1 = t1.hi;
2429     temp2 = t2.hi;
2430     t0.hi += p3;
2431     t1.hi += p4;
2432     t2.hi += p8;
2433     temp0 -= t0.hi - p3;
2434     temp1 -= t1.hi - p4;
2435     temp2 -= t2.hi - p8;
2436     t0.lo += temp0;
2437     t1.lo += temp1;
2438     t2.lo += temp2;
2439 
2440     // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the
2441     // double_doubles
2442     temp1 = t1.hi;
2443     temp2 = t2.hi;
2444     t1.hi += p1;
2445     t2.hi += p2;
2446     temp1 -= t1.hi - p1;
2447     temp2 -= t2.hi - p2;
2448     t1.lo += temp1;
2449     t2.lo += temp2;
2450 
2451     // Combine accumulators to get the low bits of result
2452     t1 = add_dd(t1, add_dd(t2, t0));
2453 
2454     // Add in MSB's, and round to precision
2455     return accum_d(t1, p0); // canonicalizes
2456 }
2457 
2458 
reference_exp10l(long double z)2459 long double reference_exp10l(long double z)
2460 {
2461     const double_double log2_10 = { HEX_DBL(+, 1, a934f0979a371, +, 1),
2462                                     HEX_DBL(+, 1, 7f2495fb7fa6d, -, 53) };
2463     double_double x;
2464     int j;
2465 
2466     // Handle NaNs
2467     if (isnan(z)) return z;
2468 
2469     // init x
2470     x.hi = z;
2471     x.lo = z - x.hi;
2472 
2473 
2474     // 10**x = exp2( x * log2(10) )
2475 
2476     x = mul_dd(x, log2_10); // x * log2(10)
2477 
2478     // Deal with overflow and underflow for exp2(x) stage next
2479     if (x.hi >= 1025) return INFINITY;
2480 
2481     if (x.hi < -1075 - 24) return +0.0;
2482 
2483     // find nearest integer to x
2484     int i = (int)rint(x.hi);
2485 
2486     // x now holds fractional part.  The result would be then 2**i  * exp2( x )
2487     x.hi -= i;
2488 
2489     // We could attempt to find a minimax polynomial for exp2(x) over the range
2490     // x = [-0.5, 0.5]. However, this would converge very slowly near the
2491     // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
2492     // requiring something like a 20th order polynomial to get 53 + 24 bits of
2493     // precision. Instead we further reduce the range to [-1/32, 1/32] by
2494     // observing that
2495     //
2496     //  2**(a+b) = 2**a * 2**b
2497     //
2498     // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
2499     // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
2500     // value of n/16 from x.
2501     const double_double corrections[17] = {
2502         { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
2503           HEX_DBL(-, 1, bdd3413b26456, -, 55) },
2504         { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
2505           HEX_DBL(-, 1, 41577ee04992f, -, 56) },
2506         { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
2507           HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
2508         { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
2509           HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
2510         { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
2511           HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
2512         { HEX_DBL(+, 1, c199bdd85529c, -, 1),
2513           HEX_DBL(+, 1, 11065895048dd, -, 56) },
2514         { HEX_DBL(+, 1, d5818dcfba487, -, 1),
2515           HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
2516         { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
2517           HEX_DBL(-, 1, e9c23179c2893, -, 55) },
2518         { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
2519         { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
2520           HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
2521         { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
2522           HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
2523         { HEX_DBL(+, 1, 2387a6e756238, +, 0),
2524           HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
2525         { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
2526           HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
2527         { HEX_DBL(+, 1, 3dea64c123422, +, 0),
2528           HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
2529         { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
2530           HEX_DBL(+, 1, d4397afec42e2, -, 56) },
2531         { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
2532           HEX_DBL(+, 1, 6324c054647ad, -, 54) },
2533         { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
2534           HEX_DBL(-, 1, bdd3413b26456, -, 54) }
2535 
2536     };
2537     int index = (int)rint(x.hi * 16.0);
2538     x.hi -= (double)index * 0.0625;
2539 
2540     // canonicalize x
2541     double temp = x.hi;
2542     x.hi += x.lo;
2543     x.lo -= x.hi - temp;
2544 
2545     // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max
2546     // Error: 2 * 0x1.e112p-87
2547     const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
2548                                   HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
2549                                 { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
2550                                   HEX_DBL(-, 1, 5e4987a631846, -, 57) },
2551                                 { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
2552                                   HEX_DBL(-, 1, d323200a05713, -, 59) },
2553                                 { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
2554                                   HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
2555                                 { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
2556                                   HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
2557                                 { HEX_DBL(+, 1, 430912f7e8373, -, 13),
2558                                   HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
2559                                 { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
2560                                   HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
2561                                 { HEX_DBL(+, 1, 62c150eb16465, -, 20),
2562                                   HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
2563                                 { HEX_DBL(+, 1, b52502b5e953, -, 24),
2564                                   HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
2565     size_t count = sizeof(c) / sizeof(c[0]);
2566 
2567     // Do polynomial
2568     double_double r = c[count - 1];
2569     for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
2570 
2571     // unwind approximation
2572     r = mul_dd(r, x); // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
2573 
2574     // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
2575     //  exp2(x) = (r + 1) * correction = r * correction + correction
2576     r = mul_dd(r, corrections[index + 8]);
2577     r = add_dd(r, corrections[index + 8]);
2578 
2579     // Format result for output:
2580 
2581     // Get mantissa
2582     long double m = ((long double)r.hi + (long double)r.lo);
2583 
2584     // Handle a pesky overflow cases when long double = double
2585     if (i > 512)
2586     {
2587         m *= HEX_DBL(+, 1, 0, +, 512);
2588         i -= 512;
2589     }
2590     else if (i < -512)
2591     {
2592         m *= HEX_DBL(+, 1, 0, -, 512);
2593         i += 512;
2594     }
2595 
2596     return m * ldexpl(1.0L, i);
2597 }
2598 
2599 
fallback_frexp(double x,int * iptr)2600 static double fallback_frexp(double x, int *iptr)
2601 {
2602     cl_ulong u, v;
2603     double fu, fv;
2604 
2605     memcpy(&u, &x, sizeof(u));
2606 
2607     cl_ulong exponent = u & 0x7ff0000000000000ULL;
2608     cl_ulong mantissa = u & ~0x7ff0000000000000ULL;
2609 
2610     // add 1 to the exponent
2611     exponent += 0x0010000000000000ULL;
2612 
2613     if ((cl_long)exponent < (cl_long)0x0020000000000000LL)
2614     { // subnormal, NaN, Inf
2615         mantissa |= 0x3fe0000000000000ULL;
2616 
2617         v = mantissa & 0xfff0000000000000ULL;
2618         u = mantissa;
2619         memcpy(&fv, &v, sizeof(v));
2620         memcpy(&fu, &u, sizeof(u));
2621 
2622         fu -= fv;
2623 
2624         memcpy(&v, &fv, sizeof(v));
2625         memcpy(&u, &fu, sizeof(u));
2626 
2627         exponent = u & 0x7ff0000000000000ULL;
2628         mantissa = u & ~0x7ff0000000000000ULL;
2629 
2630         *iptr = (exponent >> 52) + (-1022 + 1 - 1022);
2631         u = mantissa | 0x3fe0000000000000ULL;
2632         memcpy(&fu, &u, sizeof(u));
2633         return fu;
2634     }
2635 
2636     *iptr = (exponent >> 52) - 1023;
2637     u = mantissa | 0x3fe0000000000000ULL;
2638     memcpy(&fu, &u, sizeof(u));
2639     return fu;
2640 }
2641 
2642 // Assumes zeros, infinities and NaNs handed elsewhere
extract(double x,cl_ulong * mant)2643 static inline int extract(double x, cl_ulong *mant)
2644 {
2645     static double (*frexpp)(double, int *) = NULL;
2646     int e;
2647 
2648     // verify that frexp works properly
2649     if (NULL == frexpp)
2650     {
2651         if (0.5 == frexp(HEX_DBL(+, 1, 0, -, 1030), &e) && e == -1029)
2652             frexpp = frexp;
2653         else
2654             frexpp = fallback_frexp;
2655     }
2656 
2657     *mant = (cl_ulong)(HEX_DBL(+, 1, 0, +, 64) * fabs(frexpp(x, &e)));
2658     return e - 1;
2659 }
2660 
2661 // Return 128-bit product of a*b  as (hi << 64) + lo
mul128(cl_ulong a,cl_ulong b,cl_ulong * hi,cl_ulong * lo)2662 static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo)
2663 {
2664     cl_ulong alo = a & 0xffffffffULL;
2665     cl_ulong ahi = a >> 32;
2666     cl_ulong blo = b & 0xffffffffULL;
2667     cl_ulong bhi = b >> 32;
2668     cl_ulong aloblo = alo * blo;
2669     cl_ulong alobhi = alo * bhi;
2670     cl_ulong ahiblo = ahi * blo;
2671     cl_ulong ahibhi = ahi * bhi;
2672 
2673     alobhi += (aloblo >> 32)
2674         + (ahiblo
2675            & 0xffffffffULL); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   =
2676                              // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
2677     *hi = ahibhi + (alobhi >> 32)
2678         + (ahiblo >> 32); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   =
2679                           // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
2680     *lo = (aloblo & 0xffffffffULL) | (alobhi << 32);
2681 }
2682 
round_to_nearest_even_double(cl_ulong hi,cl_ulong lo,int exponent)2683 static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
2684                                            int exponent)
2685 {
2686     union {
2687         cl_ulong u;
2688         cl_double d;
2689     } u;
2690 
2691     // edges
2692     if (exponent > 1023) return INFINITY;
2693     if (exponent == -1075 && (hi | (lo != 0)) > 0x8000000000000000ULL)
2694         return HEX_DBL(+, 1, 0, -, 1074);
2695     if (exponent <= -1075) return 0.0;
2696 
2697     // Figure out which bits go where
2698     int shift = 11;
2699     if (exponent < -1022)
2700     {
2701         shift -= 1022 + exponent; // subnormal: shift is not 52
2702         exponent = -1023; //              set exponent to 0
2703     }
2704     else
2705         hi &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
2706                                      // it.
2707 
2708     // Assemble the double (round toward zero)
2709     u.u = (hi >> shift) | ((cl_ulong)(exponent + 1023) << 52);
2710 
2711     // put a representation of the residual bits into hi
2712     hi <<= (64 - shift);
2713     hi |= lo >> shift;
2714     lo <<= (64 - shift);
2715     hi |= lo != 0;
2716 
2717     // round to nearest, ties to even
2718     if (hi < 0x8000000000000000ULL) return u.d;
2719     if (hi == 0x8000000000000000ULL)
2720         u.u += u.u & 1ULL;
2721     else
2722         u.u++;
2723 
2724     return u.d;
2725 }
2726 
2727 // Shift right.  Bits lost on the right will be OR'd together and OR'd with the
2728 // LSB
shift_right_sticky_128(cl_ulong * hi,cl_ulong * lo,int shift)2729 static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, int shift)
2730 {
2731     cl_ulong sticky = 0;
2732     cl_ulong h = *hi;
2733     cl_ulong l = *lo;
2734 
2735     if (shift >= 64)
2736     {
2737         shift -= 64;
2738         sticky = 0 != lo;
2739         l = h;
2740         h = 0;
2741         if (shift >= 64)
2742         {
2743             sticky |= (0 != l);
2744             l = 0;
2745         }
2746         else
2747         {
2748             sticky |= (0 != (l << (64 - shift)));
2749             l >>= shift;
2750         }
2751     }
2752     else
2753     {
2754         sticky |= (0 != (l << (64 - shift)));
2755         l >>= shift;
2756         l |= h << (64 - shift);
2757         h >>= shift;
2758     }
2759 
2760     *lo = l | sticky;
2761     *hi = h;
2762 }
2763 
2764 // 128-bit add  of ((*hi << 64) + *lo) + ((chi << 64) + clo)
2765 // If the 129 bit result doesn't fit, bits lost off the right end will be OR'd
2766 // with the LSB
add128(cl_ulong * hi,cl_ulong * lo,cl_ulong chi,cl_ulong clo,int * exponent)2767 static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
2768                           cl_ulong clo, int *exponent)
2769 {
2770     cl_ulong carry, carry2;
2771     // extended precision add
2772     clo = add_carry(*lo, clo, &carry);
2773     chi = add_carry(*hi, chi, &carry2);
2774     chi = add_carry(chi, carry, &carry);
2775 
2776     // If we overflowed the 128 bit result
2777     if (carry || carry2)
2778     {
2779         carry = clo & 1; // set aside low bit
2780         clo >>= 1; // right shift low 1
2781         clo |= carry; // or back in the low bit, so we don't come to believe
2782                       // this is an exact half way case for rounding
2783         clo |= chi << 63; // move lowest high bit into highest bit of lo
2784         chi >>= 1; // right shift hi
2785         chi |= 0x8000000000000000ULL; // move the carry bit into hi.
2786         *exponent = *exponent + 1;
2787     }
2788 
2789     *hi = chi;
2790     *lo = clo;
2791 }
2792 
2793 // 128-bit subtract  of ((chi << 64) + clo)  - ((*hi << 64) + *lo)
sub128(cl_ulong * chi,cl_ulong * clo,cl_ulong hi,cl_ulong lo,cl_ulong * signC,int * expC)2794 static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
2795                           cl_ulong lo, cl_ulong *signC, int *expC)
2796 {
2797     cl_ulong rHi = *chi;
2798     cl_ulong rLo = *clo;
2799     cl_ulong carry, carry2;
2800 
2801     // extended precision subtract
2802     rLo = sub_carry(rLo, lo, &carry);
2803     rHi = sub_carry(rHi, hi, &carry2);
2804     rHi = sub_carry(rHi, carry, &carry);
2805 
2806     // Check for sign flip
2807     if (carry || carry2)
2808     {
2809         *signC ^= 0x8000000000000000ULL;
2810 
2811         // negate rLo, rHi:   -x = (x ^ -1) + 1
2812         rLo ^= -1ULL;
2813         rHi ^= -1ULL;
2814         rLo++;
2815         rHi += 0 == rLo;
2816     }
2817 
2818     // normalize -- move the most significant non-zero bit to the MSB, and
2819     // adjust exponent accordingly
2820     if (rHi == 0)
2821     {
2822         rHi = rLo;
2823         *expC = *expC - 64;
2824         rLo = 0;
2825     }
2826 
2827     if (rHi)
2828     {
2829         int shift = 32;
2830         cl_ulong test = 1ULL << 32;
2831         while (0 == (rHi & 0x8000000000000000ULL))
2832         {
2833             if (rHi < test)
2834             {
2835                 rHi <<= shift;
2836                 rHi |= rLo >> (64 - shift);
2837                 rLo <<= shift;
2838                 *expC = *expC - shift;
2839             }
2840             shift >>= 1;
2841             test <<= shift;
2842         }
2843     }
2844     else
2845     {
2846         // zero
2847         *expC = INT_MIN;
2848         *signC = 0;
2849     }
2850 
2851 
2852     *chi = rHi;
2853     *clo = rLo;
2854 }
2855 
reference_fmal(long double x,long double y,long double z)2856 long double reference_fmal(long double x, long double y, long double z)
2857 {
2858     static const cl_ulong kMSB = 0x8000000000000000ULL;
2859 
2860     // cast values back to double. This is an exact function, so
2861     double a = x;
2862     double b = y;
2863     double c = z;
2864 
2865     // Make bits accessible
2866     union {
2867         cl_ulong u;
2868         cl_double d;
2869     } ua;
2870     ua.d = a;
2871     union {
2872         cl_ulong u;
2873         cl_double d;
2874     } ub;
2875     ub.d = b;
2876     union {
2877         cl_ulong u;
2878         cl_double d;
2879     } uc;
2880     uc.d = c;
2881 
2882     // deal with Nans, infinities and zeros
2883     if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
2884         || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
2885         0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
2886         0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
2887     {
2888         if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
2889 
2890         a = (double)reference_multiplyl(
2891             a, b); // some risk that the compiler will insert a non-compliant
2892                    // fma here on some platforms.
2893         return reference_addl(
2894             a,
2895             c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
2896     }
2897 
2898     // extract exponent and mantissa
2899     //   exponent is a standard unbiased signed integer
2900     //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
2901     cl_ulong mantA, mantB, mantC;
2902     int expA = extract(a, &mantA);
2903     int expB = extract(b, &mantB);
2904     int expC = extract(c, &mantC);
2905     cl_ulong signC = uc.u & kMSB; // We'll need the sign bit of C later to
2906                                   // decide if we are adding or subtracting
2907 
2908     // exact product of A and B
2909     int exponent = expA + expB;
2910     cl_ulong sign = (ua.u ^ ub.u) & kMSB;
2911     cl_ulong hi, lo;
2912     mul128(mantA, mantB, &hi, &lo);
2913 
2914     // renormalize
2915     if (0 == (kMSB & hi))
2916     {
2917         hi <<= 1;
2918         hi |= lo >> 63;
2919         lo <<= 1;
2920     }
2921     else
2922         exponent++; // 2**63 * 2**63 gives 2**126. If the MSB was set, then our
2923                     // exponent increased.
2924 
2925     // infinite precision add
2926     cl_ulong chi = mantC;
2927     cl_ulong clo = 0;
2928 
2929     if (exponent >= expC)
2930     {
2931         // Normalize C relative to the product
2932         if (exponent > expC)
2933             shift_right_sticky_128(&chi, &clo, exponent - expC);
2934 
2935         // Add
2936         if (sign ^ signC)
2937             sub128(&hi, &lo, chi, clo, &sign, &exponent);
2938         else
2939             add128(&hi, &lo, chi, clo, &exponent);
2940     }
2941     else
2942     {
2943         // Shift the product relative to C so that their exponents match
2944         shift_right_sticky_128(&hi, &lo, expC - exponent);
2945 
2946         // add
2947         if (sign ^ signC)
2948             sub128(&chi, &clo, hi, lo, &signC, &expC);
2949         else
2950             add128(&chi, &clo, hi, lo, &expC);
2951 
2952         hi = chi;
2953         lo = clo;
2954         exponent = expC;
2955         sign = signC;
2956     }
2957 
2958     // round
2959     ua.d = round_to_nearest_even_double(hi, lo, exponent);
2960 
2961     // Set the sign
2962     ua.u |= sign;
2963 
2964     return ua.d;
2965 }
2966 
2967 
reference_madl(long double a,long double b,long double c)2968 long double reference_madl(long double a, long double b, long double c)
2969 {
2970     return a * b + c;
2971 }
2972 
reference_recipl(long double x)2973 long double reference_recipl(long double x) { return 1.0L / x; }
2974 
reference_rootnl(long double x,int i)2975 long double reference_rootnl(long double x, int i)
2976 {
2977     // rootn ( x, 0 )  returns a NaN.
2978     if (0 == i) return cl_make_nan();
2979 
2980     // rootn ( x, n )  returns a NaN for x < 0 and n is even.
2981     if (x < 0.0L && 0 == (i & 1)) return cl_make_nan();
2982 
2983     if (isinf(x))
2984     {
2985         if (i < 0) return reference_copysignl(0.0L, x);
2986 
2987         return x;
2988     }
2989 
2990     if (x == 0.0)
2991     {
2992         switch (i & 0x80000001)
2993         {
2994             // rootn ( +-0,  n ) is +0 for even n > 0.
2995             case 0: return 0.0L;
2996 
2997             // rootn ( +-0,  n ) is +-0 for odd n > 0.
2998             case 1: return x;
2999 
3000             // rootn ( +-0,  n ) is +inf for even n < 0.
3001             case 0x80000000: return INFINITY;
3002 
3003             // rootn ( +-0,  n ) is +-inf for odd n < 0.
3004             case 0x80000001: return copysign(INFINITY, x);
3005         }
3006     }
3007 
3008     if (i == 1) return x;
3009 
3010     if (i == -1) return 1.0 / x;
3011 
3012     long double sign = x;
3013     x = reference_fabsl(x);
3014     double iHi, iLo;
3015     DivideDD(&iHi, &iLo, 1.0, i);
3016     x = reference_powl(x, iHi) * reference_powl(x, iLo);
3017 
3018     return reference_copysignl(x, sign);
3019 }
3020 
reference_rsqrtl(long double x)3021 long double reference_rsqrtl(long double x) { return 1.0L / sqrtl(x); }
3022 
reference_sinpil(long double x)3023 long double reference_sinpil(long double x)
3024 {
3025     double r = reduce1l(x);
3026 
3027     // reduce to [-0.5, 0.5]
3028     if (r < -0.5L)
3029         r = -1.0L - r;
3030     else if (r > 0.5L)
3031         r = 1.0L - r;
3032 
3033     // sinPi zeros have the same sign as x
3034     if (r == 0.0L) return reference_copysignl(0.0L, x);
3035 
3036     return reference_sinl(r * M_PIL);
3037 }
3038 
reference_tanpil(long double x)3039 long double reference_tanpil(long double x)
3040 {
3041     // set aside the sign  (allows us to preserve sign of -0)
3042     long double sign = reference_copysignl(1.0L, x);
3043     long double z = reference_fabsl(x);
3044 
3045     // if big and even  -- caution: only works if x only has single precision
3046     if (z >= HEX_LDBL(+, 1, 0, +, 53))
3047     {
3048         if (z == INFINITY) return x - x; // nan
3049 
3050         return reference_copysignl(
3051             0.0L, x); // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
3052     }
3053 
3054     // reduce to the range [ -0.5, 0.5 ]
3055     long double nearest =
3056         reference_rintl(z); // round to nearest even places n + 0.5 values in
3057                             // the right place for us
3058     int64_t i =
3059         (int64_t)nearest; // test above against 0x1.0p53 avoids overflow here
3060     z -= nearest;
3061 
3062     // correction for odd integer x for the right sign of zero
3063     if ((i & 1) && z == 0.0L) sign = -sign;
3064 
3065     // track changes to the sign
3066     sign *= reference_copysignl(1.0L, z); // really should just be an xor
3067     z = reference_fabsl(z); // remove the sign again
3068 
3069     // reduce once more
3070     // If we don't do this, rounding error in z * M_PI will cause us not to
3071     // return infinities properly
3072     if (z > 0.25L)
3073     {
3074         z = 0.5L - z;
3075         return sign
3076             / reference_tanl(z
3077                              * M_PIL); // use system tan to get the right result
3078     }
3079 
3080     //
3081     return sign
3082         * reference_tanl(z * M_PIL); // use system tan to get the right result
3083 }
3084 
reference_pownl(long double x,int i)3085 long double reference_pownl(long double x, int i)
3086 {
3087     return reference_powl(x, (long double)i);
3088 }
3089 
reference_powrl(long double x,long double y)3090 long double reference_powrl(long double x, long double y)
3091 {
3092     // powr ( x, y ) returns NaN for x < 0.
3093     if (x < 0.0L) return cl_make_nan();
3094 
3095     // powr ( x, NaN ) returns the NaN for x >= 0.
3096     // powr ( NaN, y ) returns the NaN.
3097     if (isnan(x) || isnan(y))
3098         return x + y; // Note: behavior different here than for pow(1,NaN),
3099                       // pow(NaN, 0)
3100 
3101     if (x == 1.0L)
3102     {
3103         // powr ( +1, +-inf ) returns NaN.
3104         if (reference_fabsl(y) == INFINITY) return cl_make_nan();
3105 
3106         // powr ( +1, y ) is 1 for finite y.    (NaN handled above)
3107         return 1.0L;
3108     }
3109 
3110     if (y == 0.0L)
3111     {
3112         // powr ( +inf, +-0 ) returns NaN.
3113         // powr ( +-0, +-0 ) returns NaN.
3114         if (x == 0.0L || x == INFINITY) return cl_make_nan();
3115 
3116         // powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already
3117         // handled above)
3118         return 1.0L;
3119     }
3120 
3121     if (x == 0.0L)
3122     {
3123         // powr ( +-0, -inf) is +inf.
3124         // powr ( +-0, y ) is +inf for finite y < 0.
3125         if (y < 0.0L) return INFINITY;
3126 
3127         // powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
3128         return 0.0L;
3129     }
3130 
3131     return reference_powl(x, y);
3132 }
3133 
reference_addl(long double x,long double y)3134 long double reference_addl(long double x, long double y)
3135 {
3136     volatile double a = (double)x;
3137     volatile double b = (double)y;
3138 
3139 #if defined(__SSE2__)
3140     // defeat x87
3141     __m128d va = _mm_set_sd((double)a);
3142     __m128d vb = _mm_set_sd((double)b);
3143     va = _mm_add_sd(va, vb);
3144     _mm_store_sd((double *)&a, va);
3145 #else
3146     a += b;
3147 #endif
3148     return (long double)a;
3149 }
3150 
reference_subtractl(long double x,long double y)3151 long double reference_subtractl(long double x, long double y)
3152 {
3153     volatile double a = (double)x;
3154     volatile double b = (double)y;
3155 
3156 #if defined(__SSE2__)
3157     // defeat x87
3158     __m128d va = _mm_set_sd((double)a);
3159     __m128d vb = _mm_set_sd((double)b);
3160     va = _mm_sub_sd(va, vb);
3161     _mm_store_sd((double *)&a, va);
3162 #else
3163     a -= b;
3164 #endif
3165     return (long double)a;
3166 }
3167 
reference_multiplyl(long double x,long double y)3168 long double reference_multiplyl(long double x, long double y)
3169 {
3170     volatile double a = (double)x;
3171     volatile double b = (double)y;
3172 
3173 #if defined(__SSE2__)
3174     // defeat x87
3175     __m128d va = _mm_set_sd((double)a);
3176     __m128d vb = _mm_set_sd((double)b);
3177     va = _mm_mul_sd(va, vb);
3178     _mm_store_sd((double *)&a, va);
3179 #else
3180     a *= b;
3181 #endif
3182     return (long double)a;
3183 }
3184 
reference_lgamma_rl(long double x,int * signp)3185 long double reference_lgamma_rl(long double x, int *signp)
3186 {
3187     *signp = 0;
3188     return x;
3189 }
3190 
reference_isequall(long double x,long double y)3191 int reference_isequall(long double x, long double y) { return x == y; }
reference_isfinitel(long double x)3192 int reference_isfinitel(long double x) { return 0 != isfinite(x); }
reference_isgreaterl(long double x,long double y)3193 int reference_isgreaterl(long double x, long double y) { return x > y; }
reference_isgreaterequall(long double x,long double y)3194 int reference_isgreaterequall(long double x, long double y) { return x >= y; }
reference_isinfl(long double x)3195 int reference_isinfl(long double x) { return 0 != isinf(x); }
reference_islessl(long double x,long double y)3196 int reference_islessl(long double x, long double y) { return x < y; }
reference_islessequall(long double x,long double y)3197 int reference_islessequall(long double x, long double y) { return x <= y; }
3198 #if defined(__INTEL_COMPILER)
reference_islessgreaterl(long double x,long double y)3199 int reference_islessgreaterl(long double x, long double y)
3200 {
3201     return 0 != islessgreaterl(x, y);
3202 }
3203 #else
reference_islessgreaterl(long double x,long double y)3204 int reference_islessgreaterl(long double x, long double y)
3205 {
3206     return 0 != islessgreater(x, y);
3207 }
3208 #endif
reference_isnanl(long double x)3209 int reference_isnanl(long double x) { return 0 != isnan(x); }
reference_isnormall(long double x)3210 int reference_isnormall(long double x) { return 0 != isnormal((double)x); }
reference_isnotequall(long double x,long double y)3211 int reference_isnotequall(long double x, long double y) { return x != y; }
reference_isorderedl(long double x,long double y)3212 int reference_isorderedl(long double x, long double y)
3213 {
3214     return x == x && y == y;
3215 }
reference_isunorderedl(long double x,long double y)3216 int reference_isunorderedl(long double x, long double y)
3217 {
3218     return isnan(x) || isnan(y);
3219 }
3220 #if defined(__INTEL_COMPILER)
reference_signbitl(long double x)3221 int reference_signbitl(long double x) { return 0 != signbitl(x); }
3222 #else
reference_signbitl(long double x)3223 int reference_signbitl(long double x) { return 0 != signbit(x); }
3224 #endif
3225 long double reference_copysignl(long double x, long double y);
3226 long double reference_roundl(long double x);
3227 long double reference_cbrtl(long double x);
3228 
reference_copysignl(long double x,long double y)3229 long double reference_copysignl(long double x, long double y)
3230 {
3231     // We hope that the long double to double conversion proceeds with sign
3232     // fidelity, even for zeros and NaNs
3233     union {
3234         double d;
3235         cl_ulong u;
3236     } u;
3237     u.d = (double)y;
3238 
3239     x = reference_fabsl(x);
3240     if (u.u >> 63) x = -x;
3241 
3242     return x;
3243 }
3244 
reference_roundl(long double x)3245 long double reference_roundl(long double x)
3246 {
3247     // Since we are just using this to verify double precision, we can
3248     // use the double precision copysign here
3249 
3250 #if defined(__MINGW32__) && defined(__x86_64__)
3251     long double absx = reference_fabsl(x);
3252     if (absx < 0.5L) return reference_copysignl(0.0L, x);
3253 #endif
3254     return round((double)x);
3255 }
3256 
reference_truncl(long double x)3257 long double reference_truncl(long double x)
3258 {
3259     // Since we are just using this to verify double precision, we can
3260     // use the double precision copysign here
3261     return trunc((double)x);
3262 }
3263 
3264 static long double reference_scalblnl(long double x, long n);
3265 
reference_cbrtl(long double x)3266 long double reference_cbrtl(long double x)
3267 {
3268     double yhi = HEX_DBL(+, 1, 5555555555555, -, 2);
3269     double ylo = HEX_DBL(+, 1, 558, -, 56);
3270 
3271     double fabsx = reference_fabs(x);
3272 
3273     if (isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x)) return x;
3274 
3275     double log2x_hi, log2x_lo;
3276 
3277     // extended precision log .... accurate to at least 64-bits + couple of
3278     // guard bits
3279     __log2_ep(&log2x_hi, &log2x_lo, fabsx);
3280 
3281     double ylog2x_hi, ylog2x_lo;
3282 
3283     double y_hi = yhi;
3284     double y_lo = ylo;
3285 
3286     // compute product of y*log2(x)
3287     MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
3288 
3289     long double powxy;
3290     if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
3291     {
3292         powxy =
3293             reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
3294     }
3295     else
3296     {
3297         // separate integer + fractional part
3298         long int m = lrint(ylog2x_hi);
3299         AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
3300 
3301         // revert to long double arithemtic
3302         long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
3303         powxy = reference_exp2l(ylog2x);
3304         powxy = reference_scalblnl(powxy, m);
3305     }
3306 
3307     return reference_copysignl(powxy, x);
3308 }
3309 
reference_rintl(long double x)3310 long double reference_rintl(long double x)
3311 {
3312 #if defined(__PPC__)
3313     // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined
3314     // mantissa can represent more than LDBL_MANT_DIG binary digits.
3315     x = rintl(x);
3316 #else
3317     static long double magic[2] = { 0.0L, 0.0L };
3318 
3319     if (0.0L == magic[0])
3320     {
3321         magic[0] = scalbnl(0.5L, LDBL_MANT_DIG);
3322         magic[1] = scalbnl(-0.5L, LDBL_MANT_DIG);
3323     }
3324 
3325     if (reference_fabsl(x) < magic[0] && x != 0.0L)
3326     {
3327         long double m = magic[x < 0];
3328         x += m;
3329         x -= m;
3330     }
3331 #endif // __PPC__
3332     return x;
3333 }
3334 
3335 // extended precision sqrt using newton iteration on 1/sqrt(x).
3336 // Final result is computed as x * 1/sqrt(x)
__sqrt_ep(double * rhi,double * rlo,double xhi,double xlo)3337 static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo)
3338 {
3339     // approximate reciprocal sqrt
3340     double thi = 1.0 / sqrt(xhi);
3341     double tlo = 0.0;
3342 
3343     // One newton iteration in double-double
3344     double yhi, ylo;
3345     MulDD(&yhi, &ylo, thi, tlo, thi, tlo);
3346     MulDD(&yhi, &ylo, yhi, ylo, xhi, xlo);
3347     AddDD(&yhi, &ylo, -yhi, -ylo, 3.0, 0.0);
3348     MulDD(&yhi, &ylo, yhi, ylo, thi, tlo);
3349     MulDD(&yhi, &ylo, yhi, ylo, 0.5, 0.0);
3350 
3351     MulDD(rhi, rlo, yhi, ylo, xhi, xlo);
3352 }
3353 
reference_acoshl(long double x)3354 long double reference_acoshl(long double x)
3355 {
3356     /*
3357      * ====================================================
3358      * This function derived from fdlibm http://www.netlib.org
3359      * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
3360      *
3361      * Developed at SunSoft, a Sun Microsystems, Inc. business.
3362      * Permission to use, copy, modify, and distribute this
3363      * software is freely granted, provided that this notice
3364      * is preserved.
3365      * ====================================================
3366      *
3367      */
3368     if (isnan(x) || isinf(x)) return x + fabsl(x);
3369 
3370     if (x < 1.0L) return cl_make_nan();
3371 
3372     if (x == 1.0L) return 0.0L;
3373 
3374     if (x > HEX_LDBL(+, 1, 0, +, 60))
3375         return reference_logl(x) + 0.693147180559945309417232121458176568L;
3376 
3377     if (x > 2.0L)
3378         return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x * x - 1.0L)));
3379 
3380     double hi, lo;
3381     MulD(&hi, &lo, x, x);
3382     AddDD(&hi, &lo, hi, lo, -1.0, 0.0);
3383     __sqrt_ep(&hi, &lo, hi, lo);
3384     AddDD(&hi, &lo, hi, lo, x, 0.0);
3385     double correction = lo / hi;
3386     __log2_ep(&hi, &lo, hi);
3387     double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
3388     double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
3389     MulDD(&hi, &lo, hi, lo, log2Hi, log2Lo);
3390     AddDD(&hi, &lo, hi, lo, correction, 0.0);
3391 
3392     return hi + lo;
3393 }
3394 
reference_asinhl(long double x)3395 long double reference_asinhl(long double x)
3396 {
3397     long double cutoff = 0.0L;
3398     const long double ln2 = HEX_LDBL(+, b, 17217f7d1cf79ab, -, 4);
3399 
3400     if (cutoff == 0.0L) cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG);
3401 
3402     if (isnan(x) || isinf(x)) return x + x;
3403 
3404     long double absx = reference_fabsl(x);
3405     if (absx < cutoff) return x;
3406 
3407     long double sign = reference_copysignl(1.0L, x);
3408 
3409     if (absx <= 4.0 / 3.0)
3410     {
3411         return sign
3412             * reference_log1pl(absx + x * x / (1.0 + sqrtl(1.0 + x * x)));
3413     }
3414     else if (absx <= HEX_LDBL(+, 1, 0, +, 27))
3415     {
3416         return sign
3417             * reference_logl(2.0L * absx + 1.0L / (sqrtl(x * x + 1.0) + absx));
3418     }
3419     else
3420     {
3421         return sign * (reference_logl(absx) + ln2);
3422     }
3423 }
3424 
reference_atanhl(long double x)3425 long double reference_atanhl(long double x)
3426 {
3427     /*
3428      * ====================================================
3429      * This function is from fdlibm: http://www.netlib.org
3430      *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
3431      *
3432      * Developed at SunSoft, a Sun Microsystems, Inc. business.
3433      * Permission to use, copy, modify, and distribute this
3434      * software is freely granted, provided that this notice
3435      * is preserved.
3436      * ====================================================
3437      */
3438     if (isnan(x)) return x + x;
3439 
3440     long double signed_half = reference_copysignl(0.5L, x);
3441     x = reference_fabsl(x);
3442     if (x > 1.0L) return cl_make_nan();
3443 
3444     if (x < 0.5L)
3445         return signed_half * reference_log1pl(2.0L * (x + x * x / (1 - x)));
3446 
3447     return signed_half * reference_log1pl(2.0L * x / (1 - x));
3448 }
3449 
reference_exp2l(long double z)3450 long double reference_exp2l(long double z)
3451 {
3452     double_double x;
3453     int j;
3454 
3455     // Handle NaNs
3456     if (isnan(z)) return z;
3457 
3458     // init x
3459     x.hi = z;
3460     x.lo = z - x.hi;
3461 
3462     // Deal with overflow and underflow for exp2(x) stage next
3463     if (x.hi >= 1025) return INFINITY;
3464 
3465     if (x.hi < -1075 - 24) return +0.0;
3466 
3467     // find nearest integer to x
3468     int i = (int)rint(x.hi);
3469 
3470     // x now holds fractional part.  The result would be then 2**i  * exp2( x )
3471     x.hi -= i;
3472 
3473     // We could attempt to find a minimax polynomial for exp2(x) over the range
3474     // x = [-0.5, 0.5]. However, this would converge very slowly near the
3475     // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
3476     // requiring something like a 20th order polynomial to get 53 + 24 bits of
3477     // precision. Instead we further reduce the range to [-1/32, 1/32] by
3478     // observing that
3479     //
3480     //  2**(a+b) = 2**a * 2**b
3481     //
3482     // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
3483     // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
3484     // value of n/16 from x.
3485     const double_double corrections[17] = {
3486         { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
3487           HEX_DBL(-, 1, bdd3413b26456, -, 55) },
3488         { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
3489           HEX_DBL(-, 1, 41577ee04992f, -, 56) },
3490         { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
3491           HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
3492         { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
3493           HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
3494         { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
3495           HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
3496         { HEX_DBL(+, 1, c199bdd85529c, -, 1),
3497           HEX_DBL(+, 1, 11065895048dd, -, 56) },
3498         { HEX_DBL(+, 1, d5818dcfba487, -, 1),
3499           HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
3500         { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
3501           HEX_DBL(-, 1, e9c23179c2893, -, 55) },
3502         { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
3503         { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
3504           HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
3505         { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
3506           HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
3507         { HEX_DBL(+, 1, 2387a6e756238, +, 0),
3508           HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
3509         { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
3510           HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
3511         { HEX_DBL(+, 1, 3dea64c123422, +, 0),
3512           HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
3513         { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
3514           HEX_DBL(+, 1, d4397afec42e2, -, 56) },
3515         { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
3516           HEX_DBL(+, 1, 6324c054647ad, -, 54) },
3517         { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
3518           HEX_DBL(-, 1, bdd3413b26456, -, 54) }
3519     };
3520     int index = (int)rint(x.hi * 16.0);
3521     x.hi -= (double)index * 0.0625;
3522 
3523     // canonicalize x
3524     double temp = x.hi;
3525     x.hi += x.lo;
3526     x.lo -= x.hi - temp;
3527 
3528     // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max
3529     // Error: 2 * 0x1.e112p-87
3530     const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
3531                                   HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
3532                                 { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
3533                                   HEX_DBL(-, 1, 5e4987a631846, -, 57) },
3534                                 { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
3535                                   HEX_DBL(-, 1, d323200a05713, -, 59) },
3536                                 { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
3537                                   HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
3538                                 { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
3539                                   HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
3540                                 { HEX_DBL(+, 1, 430912f7e8373, -, 13),
3541                                   HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
3542                                 { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
3543                                   HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
3544                                 { HEX_DBL(+, 1, 62c150eb16465, -, 20),
3545                                   HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
3546                                 { HEX_DBL(+, 1, b52502b5e953, -, 24),
3547                                   HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
3548     size_t count = sizeof(c) / sizeof(c[0]);
3549 
3550     // Do polynomial
3551     double_double r = c[count - 1];
3552     for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
3553 
3554     // unwind approximation
3555     r = mul_dd(r, x); // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
3556 
3557     // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
3558     //  exp2(x) = (r + 1) * correction = r * correction + correction
3559     r = mul_dd(r, corrections[index + 8]);
3560     r = add_dd(r, corrections[index + 8]);
3561 
3562     // Format result for output:
3563 
3564     // Get mantissa
3565     long double m = ((long double)r.hi + (long double)r.lo);
3566 
3567     // Handle a pesky overflow cases when long double = double
3568     if (i > 512)
3569     {
3570         m *= HEX_DBL(+, 1, 0, +, 512);
3571         i -= 512;
3572     }
3573     else if (i < -512)
3574     {
3575         m *= HEX_DBL(+, 1, 0, -, 512);
3576         i += 512;
3577     }
3578 
3579     return m * ldexpl(1.0L, i);
3580 }
3581 
reference_expm1l(long double x)3582 long double reference_expm1l(long double x)
3583 {
3584 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
3585     // unimplemented
3586     return x;
3587 #else
3588     if (reference_isnanl(x)) return x;
3589 
3590     if (x > 710) return INFINITY;
3591 
3592     long double y = expm1l(x);
3593 
3594     // Range of expm1l is -1.0L to +inf. Negative inf
3595     // on a few Linux platforms is clearly the wrong sign.
3596     if (reference_isinfl(y)) y = INFINITY;
3597 
3598     return y;
3599 #endif
3600 }
3601 
reference_fmaxl(long double x,long double y)3602 long double reference_fmaxl(long double x, long double y)
3603 {
3604     if (isnan(y)) return x;
3605 
3606     return x >= y ? x : y;
3607 }
3608 
reference_fminl(long double x,long double y)3609 long double reference_fminl(long double x, long double y)
3610 {
3611     if (isnan(y)) return x;
3612 
3613     return x <= y ? x : y;
3614 }
3615 
reference_hypotl(long double x,long double y)3616 long double reference_hypotl(long double x, long double y)
3617 {
3618     static const double tobig = HEX_DBL(+, 1, 0, +, 511);
3619     static const double big = HEX_DBL(+, 1, 0, +, 513);
3620     static const double rbig = HEX_DBL(+, 1, 0, -, 513);
3621     static const double tosmall = HEX_DBL(+, 1, 0, -, 511);
3622     static const double smalll = HEX_DBL(+, 1, 0, -, 607);
3623     static const double rsmall = HEX_DBL(+, 1, 0, +, 607);
3624 
3625     long double max, min;
3626 
3627     if (isinf(x) || isinf(y)) return INFINITY;
3628 
3629     if (isnan(x) || isnan(y)) return x + y;
3630 
3631     x = reference_fabsl(x);
3632     y = reference_fabsl(y);
3633 
3634     max = reference_fmaxl(x, y);
3635     min = reference_fminl(x, y);
3636 
3637     if (max > tobig)
3638     {
3639         max *= rbig;
3640         min *= rbig;
3641         return big * sqrtl(max * max + min * min);
3642     }
3643 
3644     if (max < tosmall)
3645     {
3646         max *= rsmall;
3647         min *= rsmall;
3648         return smalll * sqrtl(max * max + min * min);
3649     }
3650     return sqrtl(x * x + y * y);
3651 }
3652 
reference_log2l(long double x)3653 long double reference_log2l(long double x)
3654 {
3655     if (isnan(x) || x < 0.0 || x == -INFINITY) return NAN;
3656 
3657     if (x == 0.0f) return -INFINITY;
3658 
3659     if (x == INFINITY) return INFINITY;
3660 
3661     double hi, lo;
3662     __log2_ep(&hi, &lo, x);
3663 
3664     return (long double)hi + (long double)lo;
3665 }
3666 
reference_log1pl(long double x)3667 long double reference_log1pl(long double x)
3668 {
3669 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
3670     // unimplemented
3671     return x;
3672 #elif defined(__PPC__)
3673     // log1pl on PPC inadvertantly returns NaN for very large values. Work
3674     // around this limitation by returning logl for large values.
3675     return ((x > (long double)(0x1.0p+1022)) ? logl(x) : log1pl(x));
3676 #else
3677     return log1pl(x);
3678 #endif
3679 }
3680 
reference_logbl(long double x)3681 long double reference_logbl(long double x)
3682 {
3683     // Since we are just using this to verify double precision, we can
3684     // use the double precision copysign here
3685     union {
3686         double f;
3687         cl_ulong u;
3688     } u;
3689     u.f = (double)x;
3690 
3691     cl_int exponent = (cl_uint)(u.u >> 52) & 0x7ff;
3692     if (exponent == 0x7ff) return x * x;
3693 
3694     if (exponent == 0)
3695     { // deal with denormals
3696         u.f = x * HEX_DBL(+, 1, 0, +, 64);
3697         exponent = (cl_int)(u.u >> 52) & 0x7ff;
3698         if (exponent == 0) return -INFINITY;
3699 
3700         return exponent - (1023 + 64);
3701     }
3702 
3703     return exponent - 1023;
3704 }
3705 
reference_maxmagl(long double x,long double y)3706 long double reference_maxmagl(long double x, long double y)
3707 {
3708     long double fabsx = fabsl(x);
3709     long double fabsy = fabsl(y);
3710 
3711     if (fabsx < fabsy) return y;
3712 
3713     if (fabsy < fabsx) return x;
3714 
3715     return reference_fmaxl(x, y);
3716 }
3717 
reference_minmagl(long double x,long double y)3718 long double reference_minmagl(long double x, long double y)
3719 {
3720     long double fabsx = fabsl(x);
3721     long double fabsy = fabsl(y);
3722 
3723     if (fabsx > fabsy) return y;
3724 
3725     if (fabsy > fabsx) return x;
3726 
3727     return reference_fminl(x, y);
3728 }
3729 
reference_nanl(cl_ulong x)3730 long double reference_nanl(cl_ulong x)
3731 {
3732     union {
3733         cl_ulong u;
3734         cl_double f;
3735     } u;
3736     u.u = x | 0x7ff8000000000000ULL;
3737     return (long double)u.f;
3738 }
3739 
3740 
reference_reciprocall(long double x)3741 long double reference_reciprocall(long double x) { return 1.0L / x; }
3742 
reference_remainderl(long double x,long double y)3743 long double reference_remainderl(long double x, long double y)
3744 {
3745     int i;
3746     return reference_remquol(x, y, &i);
3747 }
3748 
reference_lgammal(long double x)3749 long double reference_lgammal(long double x)
3750 {
3751     // lgamma is currently not tested
3752     return reference_lgamma(x);
3753 }
3754 
3755 static uint32_t two_over_pi[] = {
3756     0x0,        0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5,
3757     0x1993c439, 0x107f945,  0x23abdebb, 0x31586dc9, 0x6e3a424,  0x374b8019,
3758     0x92eea09,  0x3464873f, 0x21deb1cb, 0x4a69cfb,  0x288235f5, 0xbaed121,
3759     0xe99c702,  0x1ad17df9, 0x13991d6,  0xe60d4ce,  0x1f49c845, 0x3e2ef7e4,
3760     0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d,  0xd9fb3c9,
3761     0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7,
3762     0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c, 0x3046fc7b, 0x1aeafc33,
3763     0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102,
3764     0xdffd880,  0x135cc9cc, 0x10606155
3765 };
3766 
3767 static uint32_t pi_over_two[] = { 0x1,        0x2487ed51, 0x42d1846,
3768                                   0x26263314, 0x1701b839, 0x28948127 };
3769 
3770 typedef union {
3771     uint64_t u;
3772     double d;
3773 } d_ui64_t;
3774 
3775 // radix or base of representation
3776 #define RADIX (30)
3777 #define DIGITS 6
3778 
3779 d_ui64_t two_pow_pradix = { (uint64_t)(1023 + RADIX) << 52 };
3780 d_ui64_t two_pow_mradix = { (uint64_t)(1023 - RADIX) << 52 };
3781 d_ui64_t two_pow_two_mradix = { (uint64_t)(1023 - 2 * RADIX) << 52 };
3782 
3783 #define tp_pradix two_pow_pradix.d
3784 #define tp_mradix two_pow_mradix.d
3785 
3786 // extended fixed point representation of double precision
3787 // floating point number.
3788 // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ]
3789 typedef struct
3790 {
3791     uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in
3792                    // base_30
3793     int index; // exponent bias
3794     int sign; // sign of double
3795 } eprep_t;
3796 
double_to_eprep(double x)3797 static eprep_t double_to_eprep(double x)
3798 {
3799     eprep_t result;
3800 
3801     result.sign = (signbit(x) == 0) ? 1 : -1;
3802     x = fabs(x);
3803 
3804     int index = 0;
3805     while (x > tp_pradix)
3806     {
3807         index++;
3808         x *= tp_mradix;
3809     }
3810     while (x < 1)
3811     {
3812         index--;
3813         x *= tp_pradix;
3814     }
3815 
3816     result.index = index;
3817     int i = 0;
3818     result.X[0] = result.X[1] = result.X[2] = 0;
3819     while (x != 0.0)
3820     {
3821         result.X[i] = (uint32_t)x;
3822         x = (x - (double)result.X[i]) * tp_pradix;
3823         i++;
3824     }
3825     return result;
3826 }
3827 
eprep_to_double(eprep_t epx)3828 static double eprep_to_double(eprep_t epx)
3829 {
3830     double res = 0.0;
3831 
3832     res += ldexp((double)epx.X[0], (epx.index - 0) * RADIX);
3833     res += ldexp((double)epx.X[1], (epx.index - 1) * RADIX);
3834     res += ldexp((double)epx.X[2], (epx.index - 2) * RADIX);
3835 
3836     return copysign(res, epx.sign);
3837 }
3838 
payne_hanek(double * y,int * exception)3839 static int payne_hanek(double *y, int *exception)
3840 {
3841     double x = *y;
3842 
3843     // exception cases .. no reduction required
3844     if (isnan(x) || isinf(x) || (fabs(x) <= M_PI_4))
3845     {
3846         *exception = 1;
3847         return 0;
3848     }
3849 
3850     *exception = 0;
3851 
3852     // After computation result[0] contains integer part while
3853     // result[1]....result[DIGITS-1] contain fractional part. So we are doing
3854     // computation with (DIGITS-1)*RADIX precision. Default DIGITS=6 and
3855     // RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm shows
3856     // that a double precision x, closest to pi/2 is 6381956970095103 x 2^797
3857     // which can cause 61 digits of cancellation in computation of f = x*2/pi -
3858     // floor(x*2/pi) ... thus we need at least 114 bits (61 leading zeros + 53
3859     // bits of mentissa of f) of precision to accurately compute f in double
3860     // precision. Since we are using 150 bits (still an overkill), we should be
3861     // safe. Extra bits can act as guard bits for correct rounding.
3862     uint64_t result[DIGITS + 2];
3863 
3864     // compute extended precision representation of x
3865     eprep_t epx = double_to_eprep(x);
3866     int index = epx.index;
3867     int i, j;
3868     // extended precision multiplication of 2/pi*x .... we will loose at max two
3869     // RADIX=30 bit digits in the worst case
3870     for (i = 0; i < (DIGITS + 2); i++)
3871     {
3872         result[i] = 0;
3873         result[i] += ((index + i - 0) >= 0)
3874             ? ((uint64_t)two_over_pi[index + i - 0] * (uint64_t)epx.X[0])
3875             : 0;
3876         result[i] += ((index + i - 1) >= 0)
3877             ? ((uint64_t)two_over_pi[index + i - 1] * (uint64_t)epx.X[1])
3878             : 0;
3879         result[i] += ((index + i - 2) >= 0)
3880             ? ((uint64_t)two_over_pi[index + i - 2] * (uint64_t)epx.X[2])
3881             : 0;
3882     }
3883 
3884     // Carry propagation.
3885     uint64_t tmp;
3886     for (i = DIGITS + 2 - 1; i > 0; i--)
3887     {
3888         tmp = result[i] >> RADIX;
3889         result[i - 1] += tmp;
3890         result[i] -= (tmp << RADIX);
3891     }
3892 
3893     // we dont ned to normalize the integer part since only last two bits of
3894     // this will be used subsequently algorithm which remain unaltered by this
3895     // normalization. tmp = result[0] >> RADIX; result[0] -= (tmp << RADIX);
3896     unsigned int N = (unsigned int)result[0];
3897 
3898     // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that
3899     // testing if the final x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to
3900     // testing, at this stage, if r[1] (the first fractional digit) is greater
3901     // than (2^RADIX)/2 and substracting pi/4 from x_star to bring it to
3902     // mentioned range is equivalent to substracting fractional part at this
3903     // stage from one and changing the sign.
3904     int sign = 1;
3905     if (result[1] > (uint64_t)(1 << (RADIX - 1)))
3906     {
3907         for (i = 1; i < (DIGITS + 2); i++)
3908             result[i] = (~((unsigned int)result[i]) & 0x3fffffff);
3909         N += 1;
3910         sign = -1;
3911     }
3912 
3913     // Again as per Kahan-McDonald algorithim there may be 61 leading zeros in
3914     // the worst case (when x is multiple of 2/pi very close to an integer) so
3915     // we need to get rid of these zeros and adjust the index of final result.
3916     // So in the worst case, precision of comupted result is 90 bits (150 bits
3917     // original bits - 60 lost in cancellation).
3918     int ind = 1;
3919     for (i = 1; i < (DIGITS + 2); i++)
3920     {
3921         if (result[i] != 0)
3922             break;
3923         else
3924             ind++;
3925     }
3926 
3927     uint64_t r[DIGITS - 1];
3928     for (i = 0; i < (DIGITS - 1); i++)
3929     {
3930         r[i] = 0;
3931         for (j = 0; j <= i; j++)
3932         {
3933             r[i] += (result[ind + i - j] * (uint64_t)pi_over_two[j]);
3934         }
3935     }
3936     for (i = (DIGITS - 2); i > 0; i--)
3937     {
3938         tmp = r[i] >> RADIX;
3939         r[i - 1] += tmp;
3940         r[i] -= (tmp << RADIX);
3941     }
3942     tmp = r[0] >> RADIX;
3943     r[0] -= (tmp << RADIX);
3944 
3945     eprep_t epr;
3946     epr.sign = epx.sign * sign;
3947     if (tmp != 0)
3948     {
3949         epr.index = -ind + 1;
3950         epr.X[0] = (uint32_t)tmp;
3951         epr.X[1] = (uint32_t)r[0];
3952         epr.X[2] = (uint32_t)r[1];
3953     }
3954     else
3955     {
3956         epr.index = -ind;
3957         epr.X[0] = (uint32_t)r[0];
3958         epr.X[1] = (uint32_t)r[1];
3959         epr.X[2] = (uint32_t)r[2];
3960     }
3961 
3962     *y = eprep_to_double(epr);
3963     return epx.sign * N;
3964 }
3965 
reference_relaxed_cos(double x)3966 double reference_relaxed_cos(double x)
3967 {
3968     if (isnan(x)) return NAN;
3969     return (float)cos((float)x);
3970 }
3971 
reference_cos(double x)3972 double reference_cos(double x)
3973 {
3974     int exception;
3975     int N = payne_hanek(&x, &exception);
3976     if (exception) return cos(x);
3977     unsigned int c = N & 3;
3978     switch (c)
3979     {
3980         case 0: return cos(x);
3981         case 1: return -sin(x);
3982         case 2: return -cos(x);
3983         case 3: return sin(x);
3984     }
3985     return 0.0;
3986 }
3987 
reference_relaxed_sin(double x)3988 double reference_relaxed_sin(double x) { return (float)sin((float)x); }
3989 
reference_sin(double x)3990 double reference_sin(double x)
3991 {
3992     int exception;
3993     int N = payne_hanek(&x, &exception);
3994     if (exception) return sin(x);
3995     int c = N & 3;
3996     switch (c)
3997     {
3998         case 0: return sin(x);
3999         case 1: return cos(x);
4000         case 2: return -sin(x);
4001         case 3: return -cos(x);
4002     }
4003     return 0.0;
4004 }
4005 
reference_relaxed_sincos(double x,double * y)4006 double reference_relaxed_sincos(double x, double *y)
4007 {
4008     *y = reference_relaxed_cos(x);
4009     return reference_relaxed_sin(x);
4010 }
4011 
reference_sincos(double x,double * y)4012 double reference_sincos(double x, double *y)
4013 {
4014     int exception;
4015     int N = payne_hanek(&x, &exception);
4016     if (exception)
4017     {
4018         *y = cos(x);
4019         return sin(x);
4020     }
4021     int c = N & 3;
4022     switch (c)
4023     {
4024         case 0: *y = cos(x); return sin(x);
4025         case 1: *y = -sin(x); return cos(x);
4026         case 2: *y = -cos(x); return -sin(x);
4027         case 3: *y = sin(x); return -cos(x);
4028     }
4029     return 0.0;
4030 }
4031 
reference_relaxed_tan(double x)4032 double reference_relaxed_tan(double x)
4033 {
4034     return ((float)reference_relaxed_sin((float)x))
4035         / ((float)reference_relaxed_cos((float)x));
4036 }
4037 
reference_tan(double x)4038 double reference_tan(double x)
4039 {
4040     int exception;
4041     int N = payne_hanek(&x, &exception);
4042     if (exception) return tan(x);
4043     int c = N & 3;
4044     switch (c)
4045     {
4046         case 0: return tan(x);
4047         case 1: return -1.0 / tan(x);
4048         case 2: return tan(x);
4049         case 3: return -1.0 / tan(x);
4050     }
4051     return 0.0;
4052 }
4053 
reference_cosl(long double xx)4054 long double reference_cosl(long double xx)
4055 {
4056     double x = (double)xx;
4057     int exception;
4058     int N = payne_hanek(&x, &exception);
4059     if (exception) return cosl(x);
4060     unsigned int c = N & 3;
4061     switch (c)
4062     {
4063         case 0: return cosl(x);
4064         case 1: return -sinl(x);
4065         case 2: return -cosl(x);
4066         case 3: return sinl(x);
4067     }
4068     return 0.0;
4069 }
4070 
reference_sinl(long double xx)4071 long double reference_sinl(long double xx)
4072 {
4073     // we use system tanl after reduction which
4074     // can flush denorm input to zero so
4075     // take care of it here.
4076     if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
4077 
4078     double x = (double)xx;
4079     int exception;
4080     int N = payne_hanek(&x, &exception);
4081     if (exception) return sinl(x);
4082     int c = N & 3;
4083     switch (c)
4084     {
4085         case 0: return sinl(x);
4086         case 1: return cosl(x);
4087         case 2: return -sinl(x);
4088         case 3: return -cosl(x);
4089     }
4090     return 0.0;
4091 }
4092 
reference_sincosl(long double xx,long double * y)4093 long double reference_sincosl(long double xx, long double *y)
4094 {
4095     // we use system tanl after reduction which
4096     // can flush denorm input to zero so
4097     // take care of it here.
4098     if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022))
4099     {
4100         *y = cosl(xx);
4101         return xx;
4102     }
4103 
4104     double x = (double)xx;
4105     int exception;
4106     int N = payne_hanek(&x, &exception);
4107     if (exception)
4108     {
4109         *y = cosl(x);
4110         return sinl(x);
4111     }
4112     int c = N & 3;
4113     switch (c)
4114     {
4115         case 0: *y = cosl(x); return sinl(x);
4116         case 1: *y = -sinl(x); return cosl(x);
4117         case 2: *y = -cosl(x); return -sinl(x);
4118         case 3: *y = sinl(x); return -cosl(x);
4119     }
4120     return 0.0;
4121 }
4122 
reference_tanl(long double xx)4123 long double reference_tanl(long double xx)
4124 {
4125     // we use system tanl after reduction which
4126     // can flush denorm input to zero so
4127     // take care of it here.
4128     if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
4129 
4130     double x = (double)xx;
4131     int exception;
4132     int N = payne_hanek(&x, &exception);
4133     if (exception) return tanl(x);
4134     int c = N & 3;
4135     switch (c)
4136     {
4137         case 0: return tanl(x);
4138         case 1: return -1.0 / tanl(x);
4139         case 2: return tanl(x);
4140         case 3: return -1.0 / tanl(x);
4141     }
4142     return 0.0;
4143 }
4144 
4145 static double __loglTable1[64][3] = {
4146     { HEX_DBL(+, 1, 5390948f40fea, +, 0), HEX_DBL(-, 1, a152f142a, -, 2),
4147       HEX_DBL(+, 1, f93e27b43bd2c, -, 40) },
4148     { HEX_DBL(+, 1, 5015015015015, +, 0), HEX_DBL(-, 1, 921800925, -, 2),
4149       HEX_DBL(+, 1, 162432a1b8df7, -, 41) },
4150     { HEX_DBL(+, 1, 4cab88725af6e, +, 0), HEX_DBL(-, 1, 8304d90c18, -, 2),
4151       HEX_DBL(+, 1, 80bb749056fe7, -, 40) },
4152     { HEX_DBL(+, 1, 49539e3b2d066, +, 0), HEX_DBL(-, 1, 7418acebc, -, 2),
4153       HEX_DBL(+, 1, ceac7f0607711, -, 43) },
4154     { HEX_DBL(+, 1, 460cbc7f5cf9a, +, 0), HEX_DBL(-, 1, 6552b49988, -, 2),
4155       HEX_DBL(+, 1, d8913d0e89fa, -, 42) },
4156     { HEX_DBL(+, 1, 42d6625d51f86, +, 0), HEX_DBL(-, 1, 56b22e6b58, -, 2),
4157       HEX_DBL(+, 1, c7eaf515033a1, -, 44) },
4158     { HEX_DBL(+, 1, 3fb013fb013fb, +, 0), HEX_DBL(-, 1, 48365e696, -, 2),
4159       HEX_DBL(+, 1, 434adcde7edc7, -, 41) },
4160     { HEX_DBL(+, 1, 3c995a47babe7, +, 0), HEX_DBL(-, 1, 39de8e156, -, 2),
4161       HEX_DBL(+, 1, 8246f8e527754, -, 40) },
4162     { HEX_DBL(+, 1, 3991c2c187f63, +, 0), HEX_DBL(-, 1, 2baa0c34c, -, 2),
4163       HEX_DBL(+, 1, e1513c28e180d, -, 42) },
4164     { HEX_DBL(+, 1, 3698df3de0747, +, 0), HEX_DBL(-, 1, 1d982c9d58, -, 2),
4165       HEX_DBL(+, 1, 63ea3fed4b8a2, -, 40) },
4166     { HEX_DBL(+, 1, 33ae45b57bcb1, +, 0), HEX_DBL(-, 1, 0fa848045, -, 2),
4167       HEX_DBL(+, 1, 32ccbacf1779b, -, 40) },
4168     { HEX_DBL(+, 1, 30d190130d19, +, 0), HEX_DBL(-, 1, 01d9bbcfa8, -, 2),
4169       HEX_DBL(+, 1, e2bfeb2b884aa, -, 42) },
4170     { HEX_DBL(+, 1, 2e025c04b8097, +, 0), HEX_DBL(-, 1, e857d3d37, -, 3),
4171       HEX_DBL(+, 1, d9309b4d2ea85, -, 40) },
4172     { HEX_DBL(+, 1, 2b404ad012b4, +, 0), HEX_DBL(-, 1, cd3c712d4, -, 3),
4173       HEX_DBL(+, 1, ddf360962d7ab, -, 40) },
4174     { HEX_DBL(+, 1, 288b01288b012, +, 0), HEX_DBL(-, 1, b2602497e, -, 3),
4175       HEX_DBL(+, 1, 597f8a121640f, -, 40) },
4176     { HEX_DBL(+, 1, 25e22708092f1, +, 0), HEX_DBL(-, 1, 97c1cb13d, -, 3),
4177       HEX_DBL(+, 1, 02807d15580dc, -, 40) },
4178     { HEX_DBL(+, 1, 23456789abcdf, +, 0), HEX_DBL(-, 1, 7d60496d, -, 3),
4179       HEX_DBL(+, 1, 12ce913d7a827, -, 41) },
4180     { HEX_DBL(+, 1, 20b470c67c0d8, +, 0), HEX_DBL(-, 1, 633a8bf44, -, 3),
4181       HEX_DBL(+, 1, 0648bca9c96bd, -, 40) },
4182     { HEX_DBL(+, 1, 1e2ef3b3fb874, +, 0), HEX_DBL(-, 1, 494f863b9, -, 3),
4183       HEX_DBL(+, 1, 066fceb89b0eb, -, 42) },
4184     { HEX_DBL(+, 1, 1bb4a4046ed29, +, 0), HEX_DBL(-, 1, 2f9e32d5c, -, 3),
4185       HEX_DBL(+, 1, 17b8b6c4f846b, -, 46) },
4186     { HEX_DBL(+, 1, 19453808ca29c, +, 0), HEX_DBL(-, 1, 162593187, -, 3),
4187       HEX_DBL(+, 1, 2c83506452154, -, 42) },
4188     { HEX_DBL(+, 1, 16e0689427378, +, 0), HEX_DBL(-, 1, f9c95dc1e, -, 4),
4189       HEX_DBL(+, 1, dd5d2183150f3, -, 41) },
4190     { HEX_DBL(+, 1, 1485f0e0acd3b, +, 0), HEX_DBL(-, 1, c7b528b72, -, 4),
4191       HEX_DBL(+, 1, 0e43c4f4e619d, -, 40) },
4192     { HEX_DBL(+, 1, 12358e75d3033, +, 0), HEX_DBL(-, 1, 960caf9ac, -, 4),
4193       HEX_DBL(+, 1, 20fbfd5902a1e, -, 42) },
4194     { HEX_DBL(+, 1, 0fef010fef01, +, 0), HEX_DBL(-, 1, 64ce26c08, -, 4),
4195       HEX_DBL(+, 1, 8ebeefb4ac467, -, 40) },
4196     { HEX_DBL(+, 1, 0db20a88f4695, +, 0), HEX_DBL(-, 1, 33f7cde16, -, 4),
4197       HEX_DBL(+, 1, 30b3312da7a7d, -, 40) },
4198     { HEX_DBL(+, 1, 0b7e6ec259dc7, +, 0), HEX_DBL(-, 1, 0387efbcc, -, 4),
4199       HEX_DBL(+, 1, 796f1632949c3, -, 40) },
4200     { HEX_DBL(+, 1, 0953f39010953, +, 0), HEX_DBL(-, 1, a6f9c378, -, 5),
4201       HEX_DBL(+, 1, 1687e151172cc, -, 40) },
4202     { HEX_DBL(+, 1, 073260a47f7c6, +, 0), HEX_DBL(-, 1, 47aa07358, -, 5),
4203       HEX_DBL(+, 1, 1f87e4a9cc778, -, 42) },
4204     { HEX_DBL(+, 1, 05197f7d73404, +, 0), HEX_DBL(-, 1, d23afc498, -, 6),
4205       HEX_DBL(+, 1, b183a6b628487, -, 40) },
4206     { HEX_DBL(+, 1, 03091b51f5e1a, +, 0), HEX_DBL(-, 1, 16a21e21, -, 6),
4207       HEX_DBL(+, 1, 7d75c58973ce5, -, 40) },
4208     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4209     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4210     { HEX_DBL(+, 1, f44659e4a4271, -, 1), HEX_DBL(+, 1, 11cd1d51, -, 5),
4211       HEX_DBL(+, 1, 9a0d857e2f4b2, -, 40) },
4212     { HEX_DBL(+, 1, ecc07b301ecc, -, 1), HEX_DBL(+, 1, c4dfab908, -, 5),
4213       HEX_DBL(+, 1, 55b53fce557fd, -, 40) },
4214     { HEX_DBL(+, 1, e573ac901e573, -, 1), HEX_DBL(+, 1, 3aa2fdd26, -, 4),
4215       HEX_DBL(+, 1, f1cb0c9532089, -, 40) },
4216     { HEX_DBL(+, 1, de5d6e3f8868a, -, 1), HEX_DBL(+, 1, 918a16e46, -, 4),
4217       HEX_DBL(+, 1, 9af0dcd65a6e1, -, 43) },
4218     { HEX_DBL(+, 1, d77b654b82c33, -, 1), HEX_DBL(+, 1, e72ec117e, -, 4),
4219       HEX_DBL(+, 1, a5b93c4ebe124, -, 40) },
4220     { HEX_DBL(+, 1, d0cb58f6ec074, -, 1), HEX_DBL(+, 1, 1dcd19755, -, 3),
4221       HEX_DBL(+, 1, 5be50e71ddc6c, -, 42) },
4222     { HEX_DBL(+, 1, ca4b3055ee191, -, 1), HEX_DBL(+, 1, 476a9f983, -, 3),
4223       HEX_DBL(+, 1, ee9a798719e7f, -, 40) },
4224     { HEX_DBL(+, 1, c3f8f01c3f8f, -, 1), HEX_DBL(+, 1, 70742d4ef, -, 3),
4225       HEX_DBL(+, 1, 3ff1352c1219c, -, 46) },
4226     { HEX_DBL(+, 1, bdd2b899406f7, -, 1), HEX_DBL(+, 1, 98edd077e, -, 3),
4227       HEX_DBL(+, 1, c383cd11362f4, -, 41) },
4228     { HEX_DBL(+, 1, b7d6c3dda338b, -, 1), HEX_DBL(+, 1, c0db6cdd9, -, 3),
4229       HEX_DBL(+, 1, 37bd85b1a824e, -, 41) },
4230     { HEX_DBL(+, 1, b2036406c80d9, -, 1), HEX_DBL(+, 1, e840be74e, -, 3),
4231       HEX_DBL(+, 1, a9334d525e1ec, -, 41) },
4232     { HEX_DBL(+, 1, ac5701ac5701a, -, 1), HEX_DBL(+, 1, 0790adbb, -, 2),
4233       HEX_DBL(+, 1, 8060bfb6a491, -, 41) },
4234     { HEX_DBL(+, 1, a6d01a6d01a6d, -, 1), HEX_DBL(+, 1, 1ac05b2918, -, 2),
4235       HEX_DBL(+, 1, c1c161471580a, -, 40) },
4236     { HEX_DBL(+, 1, a16d3f97a4b01, -, 1), HEX_DBL(+, 1, 2db10fc4d8, -, 2),
4237       HEX_DBL(+, 1, ab1aa62214581, -, 42) },
4238     { HEX_DBL(+, 1, 9c2d14ee4a101, -, 1), HEX_DBL(+, 1, 406463b1b, -, 2),
4239       HEX_DBL(+, 1, 12e95dbda6611, -, 44) },
4240     { HEX_DBL(+, 1, 970e4f80cb872, -, 1), HEX_DBL(+, 1, 52dbdfc4c8, -, 2),
4241       HEX_DBL(+, 1, 6b53fee511af, -, 42) },
4242     { HEX_DBL(+, 1, 920fb49d0e228, -, 1), HEX_DBL(+, 1, 6518fe467, -, 2),
4243       HEX_DBL(+, 1, eea7d7d7d1764, -, 40) },
4244     { HEX_DBL(+, 1, 8d3018d3018d3, -, 1), HEX_DBL(+, 1, 771d2ba7e8, -, 2),
4245       HEX_DBL(+, 1, ecefa8d4fab97, -, 40) },
4246     { HEX_DBL(+, 1, 886e5f0abb049, -, 1), HEX_DBL(+, 1, 88e9c72e08, -, 2),
4247       HEX_DBL(+, 1, 913ea3d33fd14, -, 41) },
4248     { HEX_DBL(+, 1, 83c977ab2bedd, -, 1), HEX_DBL(+, 1, 9a802391e, -, 2),
4249       HEX_DBL(+, 1, 197e845877c94, -, 41) },
4250     { HEX_DBL(+, 1, 7f405fd017f4, -, 1), HEX_DBL(+, 1, abe18797f, -, 2),
4251       HEX_DBL(+, 1, f4a52f8e8a81, -, 42) },
4252     { HEX_DBL(+, 1, 7ad2208e0ecc3, -, 1), HEX_DBL(+, 1, bd0f2e9e78, -, 2),
4253       HEX_DBL(+, 1, 031f4336644cc, -, 42) },
4254     { HEX_DBL(+, 1, 767dce434a9b1, -, 1), HEX_DBL(+, 1, ce0a4923a, -, 2),
4255       HEX_DBL(+, 1, 61f33c897020c, -, 40) },
4256     { HEX_DBL(+, 1, 724287f46debc, -, 1), HEX_DBL(+, 1, ded3fd442, -, 2),
4257       HEX_DBL(+, 1, b2632e830632, -, 41) },
4258     { HEX_DBL(+, 1, 6e1f76b4337c6, -, 1), HEX_DBL(+, 1, ef6d673288, -, 2),
4259       HEX_DBL(+, 1, 888ec245a0bf, -, 40) },
4260     { HEX_DBL(+, 1, 6a13cd153729, -, 1), HEX_DBL(+, 1, ffd799a838, -, 2),
4261       HEX_DBL(+, 1, fe6f3b2f5fc8e, -, 40) },
4262     { HEX_DBL(+, 1, 661ec6a5122f9, -, 1), HEX_DBL(+, 1, 0809cf27f4, -, 1),
4263       HEX_DBL(+, 1, 81eaa9ef284dd, -, 40) },
4264     { HEX_DBL(+, 1, 623fa7701623f, -, 1), HEX_DBL(+, 1, 10113b153c, -, 1),
4265       HEX_DBL(+, 1, 1d7b07d6b1143, -, 42) },
4266     { HEX_DBL(+, 1, 5e75bb8d015e7, -, 1), HEX_DBL(+, 1, 18028cf728, -, 1),
4267       HEX_DBL(+, 1, 76b100b1f6c6, -, 41) },
4268     { HEX_DBL(+, 1, 5ac056b015ac, -, 1), HEX_DBL(+, 1, 1fde3d30e8, -, 1),
4269       HEX_DBL(+, 1, 26faeb9870945, -, 45) },
4270     { HEX_DBL(+, 1, 571ed3c506b39, -, 1), HEX_DBL(+, 1, 27a4c0585c, -, 1),
4271       HEX_DBL(+, 1, 7f2c5344d762b, -, 42) }
4272 };
4273 
4274 static double __loglTable2[64][3] = {
4275     { HEX_DBL(+, 1, 01fbe7f0a1be6, +, 0), HEX_DBL(-, 1, 6cf6ddd26112a, -, 7),
4276       HEX_DBL(+, 1, 0725e5755e314, -, 60) },
4277     { HEX_DBL(+, 1, 01eba93a97b12, +, 0), HEX_DBL(-, 1, 6155b1d99f603, -, 7),
4278       HEX_DBL(+, 1, 4bcea073117f4, -, 60) },
4279     { HEX_DBL(+, 1, 01db6c9029cd1, +, 0), HEX_DBL(-, 1, 55b54153137ff, -, 7),
4280       HEX_DBL(+, 1, 21e8faccad0ec, -, 61) },
4281     { HEX_DBL(+, 1, 01cb31f0f534c, +, 0), HEX_DBL(-, 1, 4a158c27245bd, -, 7),
4282       HEX_DBL(+, 1, 1a5b7bfbf35d3, -, 60) },
4283     { HEX_DBL(+, 1, 01baf95c9723c, +, 0), HEX_DBL(-, 1, 3e76923e3d678, -, 7),
4284       HEX_DBL(+, 1, eee400eb5fe34, -, 62) },
4285     { HEX_DBL(+, 1, 01aac2d2acee6, +, 0), HEX_DBL(-, 1, 32d85380ce776, -, 7),
4286       HEX_DBL(+, 1, cbf7a513937bd, -, 61) },
4287     { HEX_DBL(+, 1, 019a8e52d401e, +, 0), HEX_DBL(-, 1, 273acfd74be72, -, 7),
4288       HEX_DBL(+, 1, 5c64599efa5e6, -, 60) },
4289     { HEX_DBL(+, 1, 018a5bdca9e42, +, 0), HEX_DBL(-, 1, 1b9e072a2e65, -, 7),
4290       HEX_DBL(+, 1, 364180e0a5d37, -, 60) },
4291     { HEX_DBL(+, 1, 017a2b6fcc33e, +, 0), HEX_DBL(-, 1, 1001f961f3243, -, 7),
4292       HEX_DBL(+, 1, 63d795746f216, -, 60) },
4293     { HEX_DBL(+, 1, 0169fd0bd8a8a, +, 0), HEX_DBL(-, 1, 0466a6671bca4, -, 7),
4294       HEX_DBL(+, 1, 4c99ff1907435, -, 60) },
4295     { HEX_DBL(+, 1, 0159d0b06d129, +, 0), HEX_DBL(-, 1, f1981c445cd05, -, 8),
4296       HEX_DBL(+, 1, 4bfff6366b723, -, 62) },
4297     { HEX_DBL(+, 1, 0149a65d275a6, +, 0), HEX_DBL(-, 1, da6460f76ab8c, -, 8),
4298       HEX_DBL(+, 1, 9c5404f47589c, -, 61) },
4299     { HEX_DBL(+, 1, 01397e11a581b, +, 0), HEX_DBL(-, 1, c3321ab87f4ef, -, 8),
4300       HEX_DBL(+, 1, c0da537429cea, -, 61) },
4301     { HEX_DBL(+, 1, 012957cd85a28, +, 0), HEX_DBL(-, 1, ac014958c112c, -, 8),
4302       HEX_DBL(+, 1, 000c2a1b595e3, -, 64) },
4303     { HEX_DBL(+, 1, 0119339065ef7, +, 0), HEX_DBL(-, 1, 94d1eca95f67a, -, 8),
4304       HEX_DBL(+, 1, d8d20b0564d5, -, 61) },
4305     { HEX_DBL(+, 1, 01091159e4b3d, +, 0), HEX_DBL(-, 1, 7da4047b92b3e, -, 8),
4306       HEX_DBL(+, 1, 6194a5d68cf2, -, 66) },
4307     { HEX_DBL(+, 1, 00f8f129a0535, +, 0), HEX_DBL(-, 1, 667790a09bf77, -, 8),
4308       HEX_DBL(+, 1, ca230e0bea645, -, 61) },
4309     { HEX_DBL(+, 1, 00e8d2ff374a1, +, 0), HEX_DBL(-, 1, 4f4c90e9c4ead, -, 8),
4310       HEX_DBL(+, 1, 1de3e7f350c1, -, 61) },
4311     { HEX_DBL(+, 1, 00d8b6da482ce, +, 0), HEX_DBL(-, 1, 3823052860649, -, 8),
4312       HEX_DBL(+, 1, 5789b4c5891b8, -, 64) },
4313     { HEX_DBL(+, 1, 00c89cba71a8c, +, 0), HEX_DBL(-, 1, 20faed2dc9a9e, -, 8),
4314       HEX_DBL(+, 1, 9e7c40f9839fd, -, 62) },
4315     { HEX_DBL(+, 1, 00b8849f52834, +, 0), HEX_DBL(-, 1, 09d448cb65014, -, 8),
4316       HEX_DBL(+, 1, 387e3e9b6d02, -, 62) },
4317     { HEX_DBL(+, 1, 00a86e88899a4, +, 0), HEX_DBL(-, 1, e55e2fa53ebf1, -, 9),
4318       HEX_DBL(+, 1, cdaa71fddfddf, -, 62) },
4319     { HEX_DBL(+, 1, 00985a75b5e3f, +, 0), HEX_DBL(-, 1, b716b429dce0f, -, 9),
4320       HEX_DBL(+, 1, 2f2af081367bf, -, 63) },
4321     { HEX_DBL(+, 1, 00884866766ee, +, 0), HEX_DBL(-, 1, 88d21ec7a16d7, -, 9),
4322       HEX_DBL(+, 1, fb95c228d6f16, -, 62) },
4323     { HEX_DBL(+, 1, 0078385a6a61d, +, 0), HEX_DBL(-, 1, 5a906f219a9e8, -, 9),
4324       HEX_DBL(+, 1, 18aff10a89f29, -, 64) },
4325     { HEX_DBL(+, 1, 00682a5130fbe, +, 0), HEX_DBL(-, 1, 2c51a4dae87f1, -, 9),
4326       HEX_DBL(+, 1, bcc7e33ddde3, -, 63) },
4327     { HEX_DBL(+, 1, 00581e4a69944, +, 0), HEX_DBL(-, 1, fc2b7f2d782b1, -, 10),
4328       HEX_DBL(+, 1, fe3ef3300a9fa, -, 64) },
4329     { HEX_DBL(+, 1, 00481445b39a8, +, 0), HEX_DBL(-, 1, 9fb97df0b0b83, -, 10),
4330       HEX_DBL(+, 1, 0d9a601f2f324, -, 65) },
4331     { HEX_DBL(+, 1, 00380c42ae963, +, 0), HEX_DBL(-, 1, 434d4546227ae, -, 10),
4332       HEX_DBL(+, 1, 0b9b6a5868f33, -, 63) },
4333     { HEX_DBL(+, 1, 00280640fa271, +, 0), HEX_DBL(-, 1, cdcda8e930c19, -, 11),
4334       HEX_DBL(+, 1, 3d424ab39f789, -, 64) },
4335     { HEX_DBL(+, 1, 0018024036051, +, 0), HEX_DBL(-, 1, 150c558601261, -, 11),
4336       HEX_DBL(+, 1, 285bb90327a0f, -, 64) },
4337     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4338     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4339     { HEX_DBL(+, 1, ffa011fca0a1e, -, 1), HEX_DBL(+, 1, 14e5640c4197b, -, 10),
4340       HEX_DBL(+, 1, 95728136ae401, -, 63) },
4341     { HEX_DBL(+, 1, ff6031f064e07, -, 1), HEX_DBL(+, 1, cd61806bf532d, -, 10),
4342       HEX_DBL(+, 1, 568a4f35d8538, -, 63) },
4343     { HEX_DBL(+, 1, ff2061d532b9c, -, 1), HEX_DBL(+, 1, 42e34af550eda, -, 9),
4344       HEX_DBL(+, 1, 8f69cee55fec, -, 62) },
4345     { HEX_DBL(+, 1, fee0a1a513253, -, 1), HEX_DBL(+, 1, 9f0a5523902ea, -, 9),
4346       HEX_DBL(+, 1, daec734b11615, -, 63) },
4347     { HEX_DBL(+, 1, fea0f15a12139, -, 1), HEX_DBL(+, 1, fb25e19f11b26, -, 9),
4348       HEX_DBL(+, 1, 8bafca62941da, -, 62) },
4349     { HEX_DBL(+, 1, fe6150ee3e6d4, -, 1), HEX_DBL(+, 1, 2b9af9a28e282, -, 8),
4350       HEX_DBL(+, 1, 0fd3674e1dc5b, -, 61) },
4351     { HEX_DBL(+, 1, fe21c05baa109, -, 1), HEX_DBL(+, 1, 599d4678f24b9, -, 8),
4352       HEX_DBL(+, 1, dafce1f09937b, -, 61) },
4353     { HEX_DBL(+, 1, fde23f9c69cf9, -, 1), HEX_DBL(+, 1, 8799d8c046eb, -, 8),
4354       HEX_DBL(+, 1, ffa0ce0bdd217, -, 65) },
4355     { HEX_DBL(+, 1, fda2ceaa956e8, -, 1), HEX_DBL(+, 1, b590b1e5951ee, -, 8),
4356       HEX_DBL(+, 1, 645a769232446, -, 62) },
4357     { HEX_DBL(+, 1, fd636d8047a1f, -, 1), HEX_DBL(+, 1, e381d3555dbcf, -, 8),
4358       HEX_DBL(+, 1, 882320d368331, -, 61) },
4359     { HEX_DBL(+, 1, fd241c179e0cc, -, 1), HEX_DBL(+, 1, 08b69f3dccde, -, 7),
4360       HEX_DBL(+, 1, 01ad5065aba9e, -, 61) },
4361     { HEX_DBL(+, 1, fce4da6ab93e8, -, 1), HEX_DBL(+, 1, 1fa97a61dd298, -, 7),
4362       HEX_DBL(+, 1, 84cd1f931ae34, -, 60) },
4363     { HEX_DBL(+, 1, fca5a873bcb19, -, 1), HEX_DBL(+, 1, 36997bcc54a3f, -, 7),
4364       HEX_DBL(+, 1, 1485e97eaee03, -, 60) },
4365     { HEX_DBL(+, 1, fc66862ccec93, -, 1), HEX_DBL(+, 1, 4d86a43264a4f, -, 7),
4366       HEX_DBL(+, 1, c75e63370988b, -, 61) },
4367     { HEX_DBL(+, 1, fc27739018cfe, -, 1), HEX_DBL(+, 1, 6470f448fb09d, -, 7),
4368       HEX_DBL(+, 1, d7361eeaed0a1, -, 65) },
4369     { HEX_DBL(+, 1, fbe87097c6f5a, -, 1), HEX_DBL(+, 1, 7b586cc4c2523, -, 7),
4370       HEX_DBL(+, 1, b3df952cc473c, -, 61) },
4371     { HEX_DBL(+, 1, fba97d3e084dd, -, 1), HEX_DBL(+, 1, 923d0e5a21e06, -, 7),
4372       HEX_DBL(+, 1, cf56c7b64ae5d, -, 62) },
4373     { HEX_DBL(+, 1, fb6a997d0ecdc, -, 1), HEX_DBL(+, 1, a91ed9bd3df9a, -, 7),
4374       HEX_DBL(+, 1, b957bdcd89e43, -, 61) },
4375     { HEX_DBL(+, 1, fb2bc54f0f4ab, -, 1), HEX_DBL(+, 1, bffdcfa1f7fbb, -, 7),
4376       HEX_DBL(+, 1, ea8cad9a21771, -, 62) },
4377     { HEX_DBL(+, 1, faed00ae41783, -, 1), HEX_DBL(+, 1, d6d9f0bbee6f6, -, 7),
4378       HEX_DBL(+, 1, 5762a9af89c82, -, 60) },
4379     { HEX_DBL(+, 1, faae4b94dfe64, -, 1), HEX_DBL(+, 1, edb33dbe7d335, -, 7),
4380       HEX_DBL(+, 1, 21e24fc245697, -, 62) },
4381     { HEX_DBL(+, 1, fa6fa5fd27ff8, -, 1), HEX_DBL(+, 1, 0244dbae5ed05, -, 6),
4382       HEX_DBL(+, 1, 12ef51b967102, -, 60) },
4383     { HEX_DBL(+, 1, fa310fe15a078, -, 1), HEX_DBL(+, 1, 0daeaf24c3529, -, 6),
4384       HEX_DBL(+, 1, 10d3cfca60b45, -, 59) },
4385     { HEX_DBL(+, 1, f9f2893bb9192, -, 1), HEX_DBL(+, 1, 1917199bb66bc, -, 6),
4386       HEX_DBL(+, 1, 6cf6034c32e19, -, 60) },
4387     { HEX_DBL(+, 1, f9b412068b247, -, 1), HEX_DBL(+, 1, 247e1b6c615d5, -, 6),
4388       HEX_DBL(+, 1, 42f0fffa229f7, -, 61) },
4389     { HEX_DBL(+, 1, f975aa3c18ed6, -, 1), HEX_DBL(+, 1, 2fe3b4efcc5ad, -, 6),
4390       HEX_DBL(+, 1, 70106136a8919, -, 60) },
4391     { HEX_DBL(+, 1, f93751d6ae09b, -, 1), HEX_DBL(+, 1, 3b47e67edea93, -, 6),
4392       HEX_DBL(+, 1, 38dd5a4f6959a, -, 59) },
4393     { HEX_DBL(+, 1, f8f908d098df6, -, 1), HEX_DBL(+, 1, 46aab0725ea6c, -, 6),
4394       HEX_DBL(+, 1, 821fc1e799e01, -, 60) },
4395     { HEX_DBL(+, 1, f8bacf242aa2c, -, 1), HEX_DBL(+, 1, 520c1322f1e4e, -, 6),
4396       HEX_DBL(+, 1, 129dcda3ad563, -, 60) },
4397     { HEX_DBL(+, 1, f87ca4cbb755, -, 1), HEX_DBL(+, 1, 5d6c0ee91d2ab, -, 6),
4398       HEX_DBL(+, 1, c5b190c04606e, -, 62) },
4399     { HEX_DBL(+, 1, f83e89c195c25, -, 1), HEX_DBL(+, 1, 68caa41d448c3, -, 6),
4400       HEX_DBL(+, 1, 4723441195ac9, -, 59) }
4401 };
4402 
4403 static double __loglTable3[8][3] = {
4404     { HEX_DBL(+, 1, 000e00c40ab89, +, 0), HEX_DBL(-, 1, 4332be0032168, -, 12),
4405       HEX_DBL(+, 1, a1003588d217a, -, 65) },
4406     { HEX_DBL(+, 1, 000a006403e82, +, 0), HEX_DBL(-, 1, cdb2987366fcc, -, 13),
4407       HEX_DBL(+, 1, 5c86001294bbc, -, 67) },
4408     { HEX_DBL(+, 1, 0006002400d8, +, 0), HEX_DBL(-, 1, 150297c90fa6f, -, 13),
4409       HEX_DBL(+, 1, 01fb4865fae32, -, 66) },
4410     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4411     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4412     { HEX_DBL(+, 1, ffe8011ff280a, -, 1), HEX_DBL(+, 1, 14f8daf5e3d3b, -, 12),
4413       HEX_DBL(+, 1, 3c933b4b6b914, -, 68) },
4414     { HEX_DBL(+, 1, ffd8031fc184e, -, 1), HEX_DBL(+, 1, cd978c38042bb, -, 12),
4415       HEX_DBL(+, 1, 10f8e642e66fd, -, 65) },
4416     { HEX_DBL(+, 1, ffc8061f5492b, -, 1), HEX_DBL(+, 1, 43183c878274e, -, 11),
4417       HEX_DBL(+, 1, 5885dd1eb6582, -, 65) }
4418 };
4419 
__log2_ep(double * hi,double * lo,double x)4420 static void __log2_ep(double *hi, double *lo, double x)
4421 {
4422     union {
4423         uint64_t i;
4424         double d;
4425     } uu;
4426 
4427     int m;
4428     double f = reference_frexp(x, &m);
4429 
4430     // bring f in [0.75, 1.5)
4431     if (f < 0.75)
4432     {
4433         f *= 2.0;
4434         m -= 1;
4435     }
4436 
4437     // index first table .... brings down to [1-2^-7, 1+2^6)
4438     uu.d = f;
4439     int index =
4440         (int)(((uu.i + ((uint64_t)1 << 51)) & 0x000fc00000000000ULL) >> 46);
4441     double r1 = __loglTable1[index][0];
4442     double logr1hi = __loglTable1[index][1];
4443     double logr1lo = __loglTable1[index][2];
4444     // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since
4445     // |m| <= 1023 which needs 10bits at max, we can directly add m to log1hi
4446     // without spilling
4447     logr1hi += m;
4448 
4449     // argument reduction needs to be in double-double since reduced argument
4450     // will form the leading term of polynomial approximation which sets the
4451     // precision we eventually achieve
4452     double zhi, zlo;
4453     MulD(&zhi, &zlo, r1, uu.d);
4454 
4455     // second index table .... brings down to [1-2^-12, 1+2^-11)
4456     uu.d = zhi;
4457     index = (int)(((uu.i + ((uint64_t)1 << 46)) & 0x00007e0000000000ULL) >> 41);
4458     double r2 = __loglTable2[index][0];
4459     double logr2hi = __loglTable2[index][1];
4460     double logr2lo = __loglTable2[index][2];
4461 
4462     // reduce argument
4463     MulDD(&zhi, &zlo, zhi, zlo, r2, 0.0);
4464 
4465     // third index table .... brings down to [1-2^-14, 1+2^-13)
4466     // Actually reduction to 2^-11 would have been sufficient to calculate
4467     // second order term in polynomial in double rather than double-double, I
4468     // reduced it a bit more to make sure other systematic arithmetic errors
4469     // are guarded against .... also this allow lower order product of leading
4470     // polynomial term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather
4471     // than double-double ... hence only term that needs to be done in
4472     // double-double is Ao_hi*z_hi
4473     uu.d = zhi;
4474     index = (int)(((uu.i + ((uint64_t)1 << 41)) & 0x0000038000000000ULL) >> 39);
4475     double r3 = __loglTable3[index][0];
4476     double logr3hi = __loglTable3[index][1];
4477     double logr3lo = __loglTable3[index][2];
4478 
4479     // log2(x) = m + log2(r1) + log2(r1) + log2(1 + (zh + zlo))
4480     // calculate sum of first three terms ... note that m has already
4481     // been added to log2(r1)_hi
4482     double log2hi, log2lo;
4483     AddDD(&log2hi, &log2lo, logr1hi, logr1lo, logr2hi, logr2lo);
4484     AddDD(&log2hi, &log2lo, logr3hi, logr3lo, log2hi, log2lo);
4485 
4486     // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after
4487     // this
4488     MulDD(&zhi, &zlo, zhi, zlo, r3, 0.0);
4489     // we dont need to do full double-double substract here. substracting 1.0
4490     // for higher term is exact
4491     zhi = zhi - 1.0;
4492     // normalize
4493     AddD(&zhi, &zlo, zhi, zlo);
4494 
4495     // polynomail fitting to compute log2(1 + z) ... forth order polynomial fit
4496     // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in
4497     // [-2^-14, 2^-13] log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4
4498     // => log2(1 + z) = Ao*z + A1*z^2 + A2*z^3 + A3*z^4 + A4*z^5
4499     // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 +
4500     // A4*z^3) since we are looking for at least 64 digits of precision and z in
4501     // [-2^-14, 2^-13], final term can be done in double .... also Aolo*zhi +
4502     // Aohi*zlo can be done in double .... Aohi*zhi needs to be done in
4503     // double-double
4504 
4505     double Aohi = HEX_DBL(+, 1, 71547652b82fe, +, 0);
4506     double Aolo = HEX_DBL(+, 1, 777c9cbb675c, -, 56);
4507     double y;
4508     y = HEX_DBL(+, 1, 276d2736fade7, -, 2);
4509     y = HEX_DBL(-, 1, 7154765782df1, -, 2) + y * zhi;
4510     y = HEX_DBL(+, 1, ec709dc3a0f67, -, 2) + y * zhi;
4511     y = HEX_DBL(-, 1, 71547652b82fe, -, 1) + y * zhi;
4512     double zhisq = zhi * zhi;
4513     y = y * zhisq;
4514     y = y + zhi * Aolo;
4515     y = y + zlo * Aohi;
4516 
4517     MulD(&zhi, &zlo, Aohi, zhi);
4518     AddDD(&zhi, &zlo, zhi, zlo, y, 0.0);
4519     AddDD(&zhi, &zlo, zhi, zlo, log2hi, log2lo);
4520 
4521     *hi = zhi;
4522     *lo = zlo;
4523 }
4524 
reference_powl(long double x,long double y)4525 long double reference_powl(long double x, long double y)
4526 {
4527     // this will be used for testing doubles i.e. arguments will
4528     // be doubles so cast the input back to double ... returned
4529     // result will be long double though .... > 53 bits of precision
4530     // if platform allows.
4531     // ===========
4532     // New finding.
4533     // ===========
4534     // this function is getting used for computing reference cube root (cbrt)
4535     // as follows __powl( x, 1.0L/3.0L ) so if the y are assumed to
4536     // be double and is converted from long double to double, truncation
4537     // causes errors. So we need to tread y as long double and convert it
4538     // to hi, lo doubles when performing y*log2(x).
4539 
4540     static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
4541 
4542     // if x = 1, return x for any y, even NaN
4543     if (x == 1.0) return x;
4544 
4545     // if y == 0, return 1 for any x, even NaN
4546     if (y == 0.0) return 1.0L;
4547 
4548     // get NaNs out of the way
4549     if (x != x || y != y) return x + y;
4550 
4551     // do the work required to sort out edge cases
4552     double fabsy = reference_fabs(y);
4553     double fabsx = reference_fabs(x);
4554     double iy = reference_rint(
4555         fabsy); // we do round to nearest here so that |fy| <= 0.5
4556     if (iy > fabsy) // convert nearbyint to floor
4557         iy -= 1.0;
4558     int isOddInt = 0;
4559     if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
4560         isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
4561 
4562     /// test a few more edge cases
4563     // deal with x == 0 cases
4564     if (x == 0.0)
4565     {
4566         if (!isOddInt) x = 0.0;
4567 
4568         if (y < 0) x = 1.0 / x;
4569 
4570         return x;
4571     }
4572 
4573     // x == +-Inf cases
4574     if (isinf(fabsx))
4575     {
4576         if (x < 0)
4577         {
4578             if (isOddInt)
4579             {
4580                 if (y < 0)
4581                     return -0.0;
4582                 else
4583                     return -INFINITY;
4584             }
4585             else
4586             {
4587                 if (y < 0)
4588                     return 0.0;
4589                 else
4590                     return INFINITY;
4591             }
4592         }
4593 
4594         if (y < 0) return 0;
4595         return INFINITY;
4596     }
4597 
4598     // y = +-inf cases
4599     if (isinf(fabsy))
4600     {
4601         if (x == -1) return 1;
4602 
4603         if (y < 0)
4604         {
4605             if (fabsx < 1) return INFINITY;
4606             return 0;
4607         }
4608         if (fabsx < 1) return 0;
4609         return INFINITY;
4610     }
4611 
4612     // x < 0 and y non integer case
4613     if (x < 0 && iy != fabsy)
4614     {
4615         // return nan;
4616         return cl_make_nan();
4617     }
4618 
4619     // speedy resolution of sqrt and reciprocal sqrt
4620     if (fabsy == 0.5)
4621     {
4622         long double xl = sqrtl(x);
4623         if (y < 0) xl = 1.0 / xl;
4624         return xl;
4625     }
4626 
4627     double log2x_hi, log2x_lo;
4628 
4629     // extended precision log .... accurate to at least 64-bits + couple of
4630     // guard bits
4631     __log2_ep(&log2x_hi, &log2x_lo, fabsx);
4632 
4633     double ylog2x_hi, ylog2x_lo;
4634 
4635     double y_hi = (double)y;
4636     double y_lo = (double)(y - (long double)y_hi);
4637 
4638     // compute product of y*log2(x)
4639     // scale to avoid overflow in double-double multiplication
4640     if (reference_fabs(y) > HEX_DBL(+, 1, 0, +, 970))
4641     {
4642         y_hi = reference_ldexp(y_hi, -53);
4643         y_lo = reference_ldexp(y_lo, -53);
4644     }
4645     MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
4646     if (fabs(y) > HEX_DBL(+, 1, 0, +, 970))
4647     {
4648         ylog2x_hi = reference_ldexp(ylog2x_hi, 53);
4649         ylog2x_lo = reference_ldexp(ylog2x_lo, 53);
4650     }
4651 
4652     long double powxy;
4653     if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
4654     {
4655         powxy =
4656             reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
4657     }
4658     else
4659     {
4660         // separate integer + fractional part
4661         long int m = lrint(ylog2x_hi);
4662         AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
4663 
4664         // revert to long double arithemtic
4665         long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
4666         long double tmp = reference_exp2l(ylog2x);
4667         powxy = reference_scalblnl(tmp, m);
4668     }
4669 
4670     // if y is odd integer and x is negative, reverse sign
4671     if (isOddInt & reference_signbit(x)) powxy = -powxy;
4672     return powxy;
4673 }
4674 
reference_nextafter(double xx,double yy)4675 double reference_nextafter(double xx, double yy)
4676 {
4677     float x = (float)xx;
4678     float y = (float)yy;
4679 
4680     // take care of nans
4681     if (x != x) return x;
4682 
4683     if (y != y) return y;
4684 
4685     if (x == y) return y;
4686 
4687     int32f_t a, b;
4688 
4689     a.f = x;
4690     b.f = y;
4691 
4692     if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
4693     if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
4694 
4695     a.i += (a.i < b.i) ? 1 : -1;
4696     a.i = (a.i < 0) ? (cl_int)0x80000000 - a.i : a.i;
4697 
4698     return a.f;
4699 }
4700 
4701 
reference_nextafterl(long double xx,long double yy)4702 long double reference_nextafterl(long double xx, long double yy)
4703 {
4704     double x = (double)xx;
4705     double y = (double)yy;
4706 
4707     // take care of nans
4708     if (x != x) return x;
4709 
4710     if (y != y) return y;
4711 
4712     int64d_t a, b;
4713 
4714     a.d = x;
4715     b.d = y;
4716 
4717     int64_t tmp = 0x8000000000000000LL;
4718 
4719     if (a.l & tmp) a.l = tmp - a.l;
4720     if (b.l & tmp) b.l = tmp - b.l;
4721 
4722     // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y =
4723     // 0.0f) test needs to be done using integer rep because subnormals may be
4724     // flushed to zero on some platforms
4725     if (a.l == b.l) return y;
4726 
4727     a.l += (a.l < b.l) ? 1 : -1;
4728     a.l = (a.l < 0) ? tmp - a.l : a.l;
4729 
4730     return a.d;
4731 }
4732 
reference_fdim(double xx,double yy)4733 double reference_fdim(double xx, double yy)
4734 {
4735     float x = (float)xx;
4736     float y = (float)yy;
4737 
4738     if (x != x) return x;
4739 
4740     if (y != y) return y;
4741 
4742     float r = (x > y) ? (float)reference_subtract(x, y) : 0.0f;
4743     return r;
4744 }
4745 
4746 
reference_fdiml(long double xx,long double yy)4747 long double reference_fdiml(long double xx, long double yy)
4748 {
4749     double x = (double)xx;
4750     double y = (double)yy;
4751 
4752     if (x != x) return x;
4753 
4754     if (y != y) return y;
4755 
4756     double r = (x > y) ? (double)reference_subtractl(x, y) : 0.0;
4757     return r;
4758 }
4759 
reference_remquo(double xd,double yd,int * n)4760 double reference_remquo(double xd, double yd, int *n)
4761 {
4762     float xx = (float)xd;
4763     float yy = (float)yd;
4764 
4765     if (isnan(xx) || isnan(yy) || fabsf(xx) == INFINITY || yy == 0.0)
4766     {
4767         *n = 0;
4768         return cl_make_nan();
4769     }
4770 
4771     if (fabsf(yy) == INFINITY || xx == 0.0f)
4772     {
4773         *n = 0;
4774         return xd;
4775     }
4776 
4777     if (fabsf(xx) == fabsf(yy))
4778     {
4779         *n = (xx == yy) ? 1 : -1;
4780         return reference_signbit(xx) ? -0.0 : 0.0;
4781     }
4782 
4783     int signx = reference_signbit(xx) ? -1 : 1;
4784     int signy = reference_signbit(yy) ? -1 : 1;
4785     int signn = (signx == signy) ? 1 : -1;
4786     float x = fabsf(xx);
4787     float y = fabsf(yy);
4788 
4789     int ex, ey;
4790     ex = reference_ilogb(x);
4791     ey = reference_ilogb(y);
4792     float xr = x;
4793     float yr = y;
4794     uint32_t q = 0;
4795 
4796     if (ex - ey >= -1)
4797     {
4798 
4799         yr = (float)reference_ldexp(y, -ey);
4800         xr = (float)reference_ldexp(x, -ex);
4801 
4802         if (ex - ey >= 0)
4803         {
4804             int i;
4805             for (i = ex - ey; i > 0; i--)
4806             {
4807                 q <<= 1;
4808                 if (xr >= yr)
4809                 {
4810                     xr -= yr;
4811                     q += 1;
4812                 }
4813                 xr += xr;
4814             }
4815             q <<= 1;
4816             if (xr > yr)
4817             {
4818                 xr -= yr;
4819                 q += 1;
4820             }
4821         }
4822         else // ex-ey = -1
4823             xr = reference_ldexp(xr, ex - ey);
4824     }
4825 
4826     if ((yr < 2.0f * xr) || ((yr == 2.0f * xr) && (q & 0x00000001)))
4827     {
4828         xr -= yr;
4829         q += 1;
4830     }
4831 
4832     if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
4833 
4834     int qout = q & 0x0000007f;
4835     if (signn < 0) qout = -qout;
4836     if (xx < 0.0) xr = -xr;
4837 
4838     *n = qout;
4839 
4840     return xr;
4841 }
4842 
reference_remquol(long double xd,long double yd,int * n)4843 long double reference_remquol(long double xd, long double yd, int *n)
4844 {
4845     double xx = (double)xd;
4846     double yy = (double)yd;
4847 
4848     if (isnan(xx) || isnan(yy) || fabs(xx) == INFINITY || yy == 0.0)
4849     {
4850         *n = 0;
4851         return cl_make_nan();
4852     }
4853 
4854     if (reference_fabs(yy) == INFINITY || xx == 0.0)
4855     {
4856         *n = 0;
4857         return xd;
4858     }
4859 
4860     if (reference_fabs(xx) == reference_fabs(yy))
4861     {
4862         *n = (xx == yy) ? 1 : -1;
4863         return reference_signbit(xx) ? -0.0 : 0.0;
4864     }
4865 
4866     int signx = reference_signbit(xx) ? -1 : 1;
4867     int signy = reference_signbit(yy) ? -1 : 1;
4868     int signn = (signx == signy) ? 1 : -1;
4869     double x = reference_fabs(xx);
4870     double y = reference_fabs(yy);
4871 
4872     int ex, ey;
4873     ex = reference_ilogbl(x);
4874     ey = reference_ilogbl(y);
4875     double xr = x;
4876     double yr = y;
4877     uint32_t q = 0;
4878 
4879     if (ex - ey >= -1)
4880     {
4881         yr = reference_ldexp(y, -ey);
4882         xr = reference_ldexp(x, -ex);
4883         int i;
4884 
4885         if (ex - ey >= 0)
4886         {
4887             for (i = ex - ey; i > 0; i--)
4888             {
4889                 q <<= 1;
4890                 if (xr >= yr)
4891                 {
4892                     xr -= yr;
4893                     q += 1;
4894                 }
4895                 xr += xr;
4896             }
4897             q <<= 1;
4898             if (xr > yr)
4899             {
4900                 xr -= yr;
4901                 q += 1;
4902             }
4903         }
4904         else
4905             xr = reference_ldexp(xr, ex - ey);
4906     }
4907 
4908     if ((yr < 2.0 * xr) || ((yr == 2.0 * xr) && (q & 0x00000001)))
4909     {
4910         xr -= yr;
4911         q += 1;
4912     }
4913 
4914     if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
4915 
4916     int qout = q & 0x0000007f;
4917     if (signn < 0) qout = -qout;
4918     if (xx < 0.0) xr = -xr;
4919 
4920     *n = qout;
4921     return xr;
4922 }
4923 
reference_scalbn(double x,int n)4924 static double reference_scalbn(double x, int n)
4925 {
4926     if (reference_isinf(x) || reference_isnan(x) || x == 0.0) return x;
4927 
4928     int bias = 1023;
4929     union {
4930         double d;
4931         cl_long l;
4932     } u;
4933     u.d = (double)x;
4934     int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
4935     if (e == 0)
4936     {
4937         u.l |= ((cl_long)1023 << 52);
4938         u.d -= 1.0;
4939         e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
4940     }
4941     e += n;
4942     if (e >= 2047 || n >= 2098) return reference_copysign(INFINITY, x);
4943     if (e < -51 || n < -2097) return reference_copysign(0.0, x);
4944     if (e <= 0)
4945     {
4946         bias += (e - 1);
4947         e = 1;
4948     }
4949     u.l &= 0x800fffffffffffffLL;
4950     u.l |= ((cl_long)e << 52);
4951     x = u.d;
4952     u.l = ((cl_long)bias << 52);
4953     return x * u.d;
4954 }
4955 
reference_scalblnl(long double x,long n)4956 static long double reference_scalblnl(long double x, long n)
4957 {
4958 #if defined(__i386__) || defined(__x86_64__) // INTEL
4959     union {
4960         long double d;
4961         struct
4962         {
4963             cl_ulong m;
4964             cl_ushort sexp;
4965         } u;
4966     } u;
4967     u.u.m = CL_LONG_MIN;
4968 
4969     if (reference_isinf(x)) return x;
4970 
4971     if (x == 0.0L || n < -2200) return reference_copysignl(0.0L, x);
4972 
4973     if (n > 2200) return reference_copysignl(INFINITY, x);
4974 
4975     if (n < 0)
4976     {
4977         u.u.sexp = 0x3fff - 1022;
4978         while (n <= -1022)
4979         {
4980             x *= u.d;
4981             n += 1022;
4982         }
4983         u.u.sexp = 0x3fff + n;
4984         x *= u.d;
4985         return x;
4986     }
4987 
4988     if (n > 0)
4989     {
4990         u.u.sexp = 0x3fff + 1023;
4991         while (n >= 1023)
4992         {
4993             x *= u.d;
4994             n -= 1023;
4995         }
4996         u.u.sexp = 0x3fff + n;
4997         x *= u.d;
4998         return x;
4999     }
5000 
5001     return x;
5002 
5003 #elif defined(__arm__) // ARM .. sizeof(long double) == sizeof(double)
5004 
5005 #if __DBL_MAX_EXP__ >= __LDBL_MAX_EXP__
5006     if (reference_isinfl(x) || reference_isnanl(x)) return x;
5007 
5008     int bias = 1023;
5009     union {
5010         double d;
5011         cl_long l;
5012     } u;
5013     u.d = (double)x;
5014     int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
5015     if (e == 0)
5016     {
5017         u.l |= ((cl_long)1023 << 52);
5018         u.d -= 1.0;
5019         e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
5020     }
5021     e += n;
5022     if (e >= 2047) return reference_copysignl(INFINITY, x);
5023     if (e < -51) return reference_copysignl(0.0, x);
5024     if (e <= 0)
5025     {
5026         bias += (e - 1);
5027         e = 1;
5028     }
5029     u.l &= 0x800fffffffffffffLL;
5030     u.l |= ((cl_long)e << 52);
5031     x = u.d;
5032     u.l = ((cl_long)bias << 52);
5033     return x * u.d;
5034 #endif
5035 
5036 #else // PPC
5037     return scalblnl(x, n);
5038 #endif
5039 }
5040 
reference_relaxed_exp(double x)5041 double reference_relaxed_exp(double x) { return reference_exp(x); }
5042 
reference_exp(double x)5043 double reference_exp(double x)
5044 {
5045     return reference_exp2(x * HEX_DBL(+, 1, 71547652b82fe, +, 0));
5046 }
5047 
reference_expl(long double x)5048 long double reference_expl(long double x)
5049 {
5050 #if defined(__PPC__)
5051     long double scale, bias;
5052 
5053     // The PPC double long version of expl fails to produce denorm results
5054     // and instead generates a 0.0. Compensate for this limitation by
5055     // computing expl as:
5056     //     expl(x + 40) * expl(-40)
5057     // Likewise, overflows can prematurely produce an infinity, so we
5058     // compute expl as:
5059     //     expl(x - 40) * expl(40)
5060     scale = 1.0L;
5061     bias = 0.0L;
5062     if (x < -708.0L)
5063     {
5064         bias = 40.0;
5065         scale = expl(-40.0L);
5066     }
5067     else if (x > 708.0L)
5068     {
5069         bias = -40.0L;
5070         scale = expl(40.0L);
5071     }
5072     return expl(x + bias) * scale;
5073 #else
5074     return expl(x);
5075 #endif
5076 }
5077 
reference_sinh(double x)5078 double reference_sinh(double x) { return sinh(x); }
5079 
reference_sinhl(long double x)5080 long double reference_sinhl(long double x) { return sinhl(x); }
5081 
reference_fmod(double x,double y)5082 double reference_fmod(double x, double y)
5083 {
5084     if (x == 0.0 && fabs(y) > 0.0) return x;
5085 
5086     if (fabs(x) == INFINITY || y == 0) return cl_make_nan();
5087 
5088     if (fabs(y) == INFINITY) // we know x is finite from above
5089         return x;
5090 #if defined(_MSC_VER) && defined(_M_X64)
5091     return fmod(x, y);
5092 #else
5093     return fmodf((float)x, (float)y);
5094 #endif
5095 }
5096 
reference_fmodl(long double x,long double y)5097 long double reference_fmodl(long double x, long double y)
5098 {
5099     if (x == 0.0L && fabsl(y) > 0.0L) return x;
5100 
5101     if (fabsl(x) == INFINITY || y == 0.0L) return cl_make_nan();
5102 
5103     if (fabsl(y) == INFINITY) // we know x is finite from above
5104         return x;
5105 
5106     return fmod((double)x, (double)y);
5107 }
5108 
reference_modf(double x,double * n)5109 double reference_modf(double x, double *n)
5110 {
5111     if (isnan(x))
5112     {
5113         *n = cl_make_nan();
5114         return cl_make_nan();
5115     }
5116     float nr;
5117     float yr = modff((float)x, &nr);
5118     *n = nr;
5119     return yr;
5120 }
5121 
reference_modfl(long double x,long double * n)5122 long double reference_modfl(long double x, long double *n)
5123 {
5124     if (isnan(x))
5125     {
5126         *n = cl_make_nan();
5127         return cl_make_nan();
5128     }
5129     double nr;
5130     double yr = modf((double)x, &nr);
5131     *n = nr;
5132     return yr;
5133 }
5134 
reference_fractl(long double x,long double * ip)5135 long double reference_fractl(long double x, long double *ip)
5136 {
5137     if (isnan(x))
5138     {
5139         *ip = cl_make_nan();
5140         return cl_make_nan();
5141     }
5142 
5143     double i;
5144     double f = modf((double)x, &i);
5145     if (f < 0.0)
5146     {
5147         f = 1.0 + f;
5148         i -= 1.0;
5149         if (f == 1.0) f = HEX_DBL(+, 1, fffffffffffff, -, 1);
5150     }
5151     *ip = i;
5152     return f;
5153 }
5154 
reference_fabsl(long double x)5155 long double reference_fabsl(long double x) { return fabsl(x); }
5156 
reference_relaxed_log(double x)5157 double reference_relaxed_log(double x)
5158 {
5159     return (float)reference_log((float)x);
5160 }
5161 
reference_log(double x)5162 double reference_log(double x)
5163 {
5164     if (x == 0.0) return -INFINITY;
5165 
5166     if (x < 0.0) return cl_make_nan();
5167 
5168     if (isinf(x)) return INFINITY;
5169 
5170     double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
5171     double logxHi, logxLo;
5172     __log2_ep(&logxHi, &logxLo, x);
5173     return logxHi * log2Hi;
5174 }
5175 
reference_logl(long double x)5176 long double reference_logl(long double x)
5177 {
5178     if (x == 0.0) return -INFINITY;
5179 
5180     if (x < 0.0) return cl_make_nan();
5181 
5182     if (isinf(x)) return INFINITY;
5183 
5184     double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
5185     double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
5186     double logxHi, logxLo;
5187     __log2_ep(&logxHi, &logxLo, x);
5188 
5189     long double lg2 = (long double)log2Hi + (long double)log2Lo;
5190     long double logx = (long double)logxHi + (long double)logxLo;
5191     return logx * lg2;
5192 }
5193 
reference_relaxed_pow(double x,double y)5194 double reference_relaxed_pow(double x, double y)
5195 {
5196     return (float)reference_exp2(((float)y) * (float)reference_log2((float)x));
5197 }
5198 
reference_pow(double x,double y)5199 double reference_pow(double x, double y)
5200 {
5201     static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
5202 
5203     // if x = 1, return x for any y, even NaN
5204     if (x == 1.0) return x;
5205 
5206     // if y == 0, return 1 for any x, even NaN
5207     if (y == 0.0) return 1.0;
5208 
5209     // get NaNs out of the way
5210     if (x != x || y != y) return x + y;
5211 
5212     // do the work required to sort out edge cases
5213     double fabsy = reference_fabs(y);
5214     double fabsx = reference_fabs(x);
5215     double iy = reference_rint(
5216         fabsy); // we do round to nearest here so that |fy| <= 0.5
5217     if (iy > fabsy) // convert nearbyint to floor
5218         iy -= 1.0;
5219     int isOddInt = 0;
5220     if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
5221         isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
5222 
5223     /// test a few more edge cases
5224     // deal with x == 0 cases
5225     if (x == 0.0)
5226     {
5227         if (!isOddInt) x = 0.0;
5228 
5229         if (y < 0) x = 1.0 / x;
5230 
5231         return x;
5232     }
5233 
5234     // x == +-Inf cases
5235     if (isinf(fabsx))
5236     {
5237         if (x < 0)
5238         {
5239             if (isOddInt)
5240             {
5241                 if (y < 0)
5242                     return -0.0;
5243                 else
5244                     return -INFINITY;
5245             }
5246             else
5247             {
5248                 if (y < 0)
5249                     return 0.0;
5250                 else
5251                     return INFINITY;
5252             }
5253         }
5254 
5255         if (y < 0) return 0;
5256         return INFINITY;
5257     }
5258 
5259     // y = +-inf cases
5260     if (isinf(fabsy))
5261     {
5262         if (x == -1) return 1;
5263 
5264         if (y < 0)
5265         {
5266             if (fabsx < 1) return INFINITY;
5267             return 0;
5268         }
5269         if (fabsx < 1) return 0;
5270         return INFINITY;
5271     }
5272 
5273     // x < 0 and y non integer case
5274     if (x < 0 && iy != fabsy)
5275     {
5276         // return nan;
5277         return cl_make_nan();
5278     }
5279 
5280     // speedy resolution of sqrt and reciprocal sqrt
5281     if (fabsy == 0.5)
5282     {
5283         long double xl = reference_sqrt(x);
5284         if (y < 0) xl = 1.0 / xl;
5285         return xl;
5286     }
5287 
5288     double hi, lo;
5289     __log2_ep(&hi, &lo, fabsx);
5290     double prod = y * hi;
5291     double result = reference_exp2(prod);
5292     return isOddInt ? reference_copysignd(result, x) : result;
5293 }
5294 
reference_sqrt(double x)5295 double reference_sqrt(double x) { return sqrt(x); }
5296 
reference_floor(double x)5297 double reference_floor(double x) { return floorf((float)x); }
5298 
reference_ldexp(double value,int exponent)5299 double reference_ldexp(double value, int exponent)
5300 {
5301 #ifdef __MINGW32__
5302     /*
5303      * ====================================================
5304      * This function is from fdlibm: http://www.netlib.org
5305      *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
5306      *
5307      * Developed at SunSoft, a Sun Microsystems, Inc. business.
5308      * Permission to use, copy, modify, and distribute this
5309      * software is freely granted, provided that this notice
5310      * is preserved.
5311      * ====================================================
5312      */
5313     if (!finite(value) || value == 0.0) return value;
5314     return scalbn(value, exponent);
5315 #else
5316     return reference_scalbn(value, exponent);
5317 #endif
5318 }
5319 
reference_ldexpl(long double x,int n)5320 long double reference_ldexpl(long double x, int n) { return ldexpl(x, n); }
5321 
reference_coshl(long double x)5322 long double reference_coshl(long double x) { return coshl(x); }
5323 
reference_ceil(double x)5324 double reference_ceil(double x) { return ceilf((float)x); }
5325 
reference_ceill(long double x)5326 long double reference_ceill(long double x)
5327 {
5328     if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
5329 
5330     long double absx = reference_fabsl(x);
5331     if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
5332 
5333     if (absx < 1.0)
5334     {
5335         if (x < 0.0)
5336             return 0.0;
5337         else
5338             return 1.0;
5339     }
5340 
5341     long double r = (long double)((cl_long)x);
5342 
5343     if (x > 0.0 && r < x) r += 1.0;
5344 
5345     return r;
5346 }
5347 
5348 
reference_acosl(long double x)5349 long double reference_acosl(long double x)
5350 {
5351     long double x2 = x * x;
5352     int i;
5353 
5354     // Prepare a head + tail representation of PI in long double.  A good
5355     // compiler should get rid of all of this work.
5356     static const cl_ulong pi_bits[2] = {
5357         0x3243F6A8885A308DULL, 0x313198A2E0370734ULL
5358     }; // first 126 bits of pi
5359        // http://www.super-computing.org/pi-hexa_current.html
5360     long double head, tail, temp;
5361 #if __LDBL_MANT_DIG__ >= 64
5362     // long double has 64-bits of precision or greater
5363     temp = (long double)pi_bits[0] * 0x1.0p64L;
5364     head = temp + (long double)pi_bits[1];
5365     temp -= head; // rounding err rounding pi_bits[1] into head
5366     tail = (long double)pi_bits[1] + temp;
5367     head *= HEX_LDBL(+, 1, 0, -, 125);
5368     tail *= HEX_LDBL(+, 1, 0, -, 125);
5369 #else
5370     head = (long double)pi_bits[0];
5371     tail =
5372         (long double)((cl_long)pi_bits[0]
5373                       - (cl_long)
5374                           head); // residual part of pi_bits[0] after rounding
5375     tail = tail * HEX_LDBL(+, 1, 0, +, 64) + (long double)pi_bits[1];
5376     head *= HEX_LDBL(+, 1, 0, -, 61);
5377     tail *= HEX_LDBL(+, 1, 0, -, 125);
5378 #endif
5379 
5380     // oversize values and NaNs go to NaN
5381     if (!(x2 <= 1.0)) return sqrtl(1.0L - x2);
5382 
5383     //
5384     // deal with large |x|:
5385     //                                                      sqrt( 1 - x**2)
5386     // acos(|x| > sqrt(0.5)) = 2 * atan( z );       z = -------------------- ;
5387     // z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...]
5388     //                                                          1 + x
5389     if (x2 > 0.5)
5390     {
5391         // we handle the x < 0 case as pi - acos(|x|)
5392 
5393         long double sign = reference_copysignl(1.0L, x);
5394         long double fabsx = reference_fabsl(x);
5395         head -= head * sign; // x > 0 ? 0 : pi.hi
5396         tail -= tail * sign; // x > 0 ? 0 : pi.low
5397 
5398         // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 ) = sqrt(
5399         // (1-x)/(1+x) )
5400         long double z2 = (1.0L - fabsx) / (1.0L + fabsx); // z**2
5401         long double z = sign * sqrtl(z2);
5402 
5403         //                     atan(sqrt(q))
5404         // Minimax fit p(x) = ---------------- - 1
5405         //                        sqrt(q)
5406         //
5407         // Define q = r*r, and solve for atan(r):
5408         //
5409         //  atan(r) = (p(r) + 1) * r = rp(r) + r
5410         static long double atan_coeffs[] = {
5411             HEX_LDBL(-, b, 3f52e0c278293b3, -, 67),
5412             HEX_LDBL(-, a, aaaaaaaaaaa95b8, -, 5),
5413             HEX_LDBL(+, c, ccccccccc992407, -, 6),
5414             HEX_LDBL(-, 9, 24924923024398, -, 6),
5415             HEX_LDBL(+, e, 38e38d6f92c98f3, -, 7),
5416             HEX_LDBL(-, b, a2e89bfb8393ec6, -, 7),
5417             HEX_LDBL(+, 9, d89a9f574d412cb, -, 7),
5418             HEX_LDBL(-, 8, 88580517884c547, -, 7),
5419             HEX_LDBL(+, f, 0ab6756abdad408, -, 8),
5420             HEX_LDBL(-, d, 56a5b07a2f15b49, -, 8),
5421             HEX_LDBL(+, b, 72ab587e46d80b2, -, 8),
5422             HEX_LDBL(-, 8, 62ea24bb5b2e636, -, 8),
5423             HEX_LDBL(+, e, d67c16582123937, -, 10)
5424         }; // minimax fit over [ 0x1.0p-52, 0.18]   Max error:
5425            // 0x1.67ea5c184e5d9p-64
5426 
5427         // Calculate y = p(r)
5428         const size_t atan_coeff_count =
5429             sizeof(atan_coeffs) / sizeof(atan_coeffs[0]);
5430         long double y = atan_coeffs[atan_coeff_count - 1];
5431         for (i = (int)atan_coeff_count - 2; i >= 0; i--)
5432             y = atan_coeffs[i] + y * z2;
5433 
5434         z *= 2.0L; // fold in 2.0 for 2.0 * atan(z)
5435         y *= z; // rp(r)
5436 
5437         return head + ((y + tail) + z);
5438     }
5439 
5440     // do |x| <= sqrt(0.5) here
5441     //                                                     acos( sqrt(z) ) -
5442     //                                                     PI/2
5443     //  Piecewise minimax polynomial fits for p(z) = 1 +
5444     //  ------------------------;
5445     //                                                            sqrt(z)
5446     //
5447     //  Define z = x*x, and solve for acos(x) over x in  x >= 0:
5448     //
5449     //      acos( sqrt(z) ) = acos(x) = x*(p(z)-1) + PI/2 = xp(x**2) - x + PI/2
5450     //
5451     const long double coeffs[4][14] = {
5452         { HEX_LDBL(-, a, fa7382e1f347974, -, 10),
5453           HEX_LDBL(-, b, 4d5a992de1ac4da, -, 6),
5454           HEX_LDBL(-, a, c526184bd558c17, -, 7),
5455           HEX_LDBL(-, d, 9ed9b0346ec092a, -, 8),
5456           HEX_LDBL(-, 9, dca410c1f04b1f, -, 8),
5457           HEX_LDBL(-, f, 76e411ba9581ee5, -, 9),
5458           HEX_LDBL(-, c, c71b00479541d8e, -, 9),
5459           HEX_LDBL(-, a, f527a3f9745c9de, -, 9),
5460           HEX_LDBL(-, 9, a93060051f48d14, -, 9),
5461           HEX_LDBL(-, 8, b3d39ad70e06021, -, 9),
5462           HEX_LDBL(-, f, f2ab95ab84f79c, -, 10),
5463           HEX_LDBL(-, e, d1af5f5301ccfe4, -, 10),
5464           HEX_LDBL(-, e, 1b53ba562f0f74a, -, 10),
5465           HEX_LDBL(-, d, 6a3851330e15526, -,
5466                    10) }, // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ]
5467                           // Error: 0x1.97839bf07024p-76
5468 
5469         { HEX_LDBL(-, 8, c2f1d638e4c1b48, -, 8),
5470           HEX_LDBL(-, c, d47ac903c311c2c, -, 6),
5471           HEX_LDBL(-, d, e020b2dabd5606a, -, 7),
5472           HEX_LDBL(-, a, 086fafac220f16b, -, 7),
5473           HEX_LDBL(-, 8, 55b5efaf6b86c3e, -, 7),
5474           HEX_LDBL(-, f, 05c9774fed2f571, -, 8),
5475           HEX_LDBL(-, e, 484a93f7f0fc772, -, 8),
5476           HEX_LDBL(-, e, 1a32baef01626e4, -, 8),
5477           HEX_LDBL(-, e, 528e525b5c9c73d, -, 8),
5478           HEX_LDBL(-, e, ddd5d27ad49b2c8, -, 8),
5479           HEX_LDBL(-, f, b3259e7ae10c6f, -, 8),
5480           HEX_LDBL(-, 8, 68998170d5b19b7, -, 7),
5481           HEX_LDBL(-, 9, 4468907f007727, -, 7),
5482           HEX_LDBL(-, a, 2ad5e4906a8e7b3, -,
5483                    7) }, // x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ]    Error:
5484                          // 0x1.647af70073457p-73
5485 
5486         { HEX_LDBL(-, f, a76585ad399e7ac, -, 8),
5487           HEX_LDBL(-, e, d665b7dd504ca7c, -, 6),
5488           HEX_LDBL(-, 9, 4c7c2402bd4bc33, -, 6),
5489           HEX_LDBL(-, f, ba76b69074ff71c, -, 7),
5490           HEX_LDBL(-, f, 58117784bdb6d5f, -, 7),
5491           HEX_LDBL(-, 8, 22ddd8eef53227d, -, 6),
5492           HEX_LDBL(-, 9, 1d1d3b57a63cdb4, -, 6),
5493           HEX_LDBL(-, a, 9c4bdc40cca848, -, 6),
5494           HEX_LDBL(-, c, b673b12794edb24, -, 6),
5495           HEX_LDBL(-, f, 9290a06e31575bf, -, 6),
5496           HEX_LDBL(-, 9, b4929c16aeb3d1f, -, 5),
5497           HEX_LDBL(-, c, 461e725765a7581, -, 5),
5498           HEX_LDBL(-, 8, 0a59654c98d9207, -, 4),
5499           HEX_LDBL(-, a, 6de6cbd96c80562, -,
5500                    4) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ]   Error:
5501                          // 0x1.b0246c304ce1ap-70
5502 
5503         { HEX_LDBL(-, b, dca8b0359f96342, -, 7),
5504           HEX_LDBL(-, 8, cd2522fcde9823, -, 5),
5505           HEX_LDBL(-, d, 2af9397b27ff74d, -, 6),
5506           HEX_LDBL(-, d, 723f2c2c2409811, -, 6),
5507           HEX_LDBL(-, f, ea8f8481ecc3cd1, -, 6),
5508           HEX_LDBL(-, a, 43fd8a7a646b0b2, -, 5),
5509           HEX_LDBL(-, e, 01b0bf63a4e8d76, -, 5),
5510           HEX_LDBL(-, 9, f0b7096a2a7b4d, -, 4),
5511           HEX_LDBL(-, e, 872e7c5a627ab4c, -, 4),
5512           HEX_LDBL(-, a, dbd760a1882da48, -, 3),
5513           HEX_LDBL(-, 8, 424e4dea31dd273, -, 2),
5514           HEX_LDBL(-, c, c05d7730963e793, -, 2),
5515           HEX_LDBL(-, a, 523d97197cd124a, -, 1),
5516           HEX_LDBL(-, 8, 307ba943978aaee, +,
5517                    0) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ]  Error:
5518                         // 0x1.9ecff73da69c9p-66
5519     };
5520 
5521     const long double offsets[4] = { 0.0625, 0.1875, 0.3125, 0.4375 };
5522     const size_t coeff_count = sizeof(coeffs[0]) / sizeof(coeffs[0][0]);
5523 
5524     // reduce the incoming values a bit so that they are in the range
5525     // [-0x1.0p-4, 0x1.0p-4]
5526     const long double *c;
5527     i = x2 * 8.0L;
5528     c = coeffs[i];
5529     x2 -= offsets[i]; // exact
5530 
5531     // calcualte p(x2)
5532     long double y = c[coeff_count - 1];
5533     for (i = (int)coeff_count - 2; i >= 0; i--) y = c[i] + y * x2;
5534 
5535     // xp(x2)
5536     y *= x;
5537 
5538     // return xp(x2) - x + PI/2
5539     return head + ((y + tail) - x);
5540 }
5541 
reference_relaxed_acos(double x)5542 double reference_relaxed_acos(double x) { return reference_acos(x); }
5543 
reference_log10(double x)5544 double reference_log10(double x)
5545 {
5546     if (x == 0.0) return -INFINITY;
5547 
5548     if (x < 0.0) return cl_make_nan();
5549 
5550     if (isinf(x)) return INFINITY;
5551 
5552     double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
5553     double logxHi, logxLo;
5554     __log2_ep(&logxHi, &logxLo, x);
5555     return logxHi * log2Hi;
5556 }
5557 
reference_relaxed_log10(double x)5558 double reference_relaxed_log10(double x) { return reference_log10(x); }
5559 
reference_log10l(long double x)5560 long double reference_log10l(long double x)
5561 {
5562     if (x == 0.0) return -INFINITY;
5563 
5564     if (x < 0.0) return cl_make_nan();
5565 
5566     if (isinf(x)) return INFINITY;
5567 
5568     double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
5569     double log2Lo = HEX_DBL(+, 1, e623e2566b02d, -, 55);
5570     double logxHi, logxLo;
5571     __log2_ep(&logxHi, &logxLo, x);
5572 
5573     long double lg2 = (long double)log2Hi + (long double)log2Lo;
5574     long double logx = (long double)logxHi + (long double)logxLo;
5575     return logx * lg2;
5576 }
5577 
reference_acos(double x)5578 double reference_acos(double x) { return acos(x); }
5579 
reference_atan2(double x,double y)5580 double reference_atan2(double x, double y)
5581 {
5582 #if defined(_WIN32)
5583     // fix edge cases for Windows
5584     if (isinf(x) && isinf(y))
5585     {
5586         double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
5587         return (x > 0) ? retval : -retval;
5588     }
5589 #endif // _WIN32
5590     return atan2(x, y);
5591 }
5592 
reference_atan2l(long double x,long double y)5593 long double reference_atan2l(long double x, long double y)
5594 {
5595 #if defined(_WIN32)
5596     // fix edge cases for Windows
5597     if (isinf(x) && isinf(y))
5598     {
5599         long double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
5600         return (x > 0) ? retval : -retval;
5601     }
5602 #endif // _WIN32
5603     return atan2l(x, y);
5604 }
5605 
reference_frexp(double a,int * exp)5606 double reference_frexp(double a, int *exp)
5607 {
5608     if (isnan(a) || isinf(a) || a == 0.0)
5609     {
5610         *exp = 0;
5611         return a;
5612     }
5613 
5614     union {
5615         cl_double d;
5616         cl_ulong l;
5617     } u;
5618 
5619     u.d = a;
5620 
5621     // separate out sign
5622     cl_ulong s = u.l & 0x8000000000000000ULL;
5623     u.l &= 0x7fffffffffffffffULL;
5624     int bias = -1022;
5625 
5626     if ((u.l & 0x7ff0000000000000ULL) == 0)
5627     {
5628         double d = u.l;
5629         u.d = d;
5630         bias -= 1074;
5631     }
5632 
5633     int e = (int)((u.l & 0x7ff0000000000000ULL) >> 52);
5634     u.l &= 0x000fffffffffffffULL;
5635     e += bias;
5636     u.l |= ((cl_ulong)1022 << 52);
5637     u.l |= s;
5638 
5639     *exp = e;
5640     return u.d;
5641 }
5642 
reference_frexpl(long double a,int * exp)5643 long double reference_frexpl(long double a, int *exp)
5644 {
5645     if (isnan(a) || isinf(a) || a == 0.0)
5646     {
5647         *exp = 0;
5648         return a;
5649     }
5650 
5651     if (sizeof(long double) == sizeof(double))
5652     {
5653         return reference_frexp(a, exp);
5654     }
5655     else
5656     {
5657         return frexpl(a, exp);
5658     }
5659 }
5660 
5661 
reference_atan(double x)5662 double reference_atan(double x) { return atan(x); }
5663 
reference_atanl(long double x)5664 long double reference_atanl(long double x) { return atanl(x); }
5665 
reference_asinl(long double x)5666 long double reference_asinl(long double x) { return asinl(x); }
5667 
reference_asin(double x)5668 double reference_asin(double x) { return asin(x); }
5669 
reference_relaxed_asin(double x)5670 double reference_relaxed_asin(double x) { return reference_asin(x); }
5671 
reference_fabs(double x)5672 double reference_fabs(double x) { return fabs(x); }
5673 
reference_cosh(double x)5674 double reference_cosh(double x) { return cosh(x); }
5675 
reference_sqrtl(long double x)5676 long double reference_sqrtl(long double x)
5677 {
5678 #if defined(__SSE2__)                                                          \
5679     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
5680     __m128d result128 = _mm_set_sd((double)x);
5681     result128 = _mm_sqrt_sd(result128, result128);
5682     return _mm_cvtsd_f64(result128);
5683 #else
5684     volatile double dx = x;
5685     return sqrt(dx);
5686 #endif
5687 }
5688 
reference_tanhl(long double x)5689 long double reference_tanhl(long double x) { return tanhl(x); }
5690 
reference_floorl(long double x)5691 long double reference_floorl(long double x)
5692 {
5693     if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
5694 
5695     long double absx = reference_fabsl(x);
5696     if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
5697 
5698     if (absx < 1.0)
5699     {
5700         if (x < 0.0)
5701             return -1.0;
5702         else
5703             return 0.0;
5704     }
5705 
5706     long double r = (long double)((cl_long)x);
5707 
5708     if (x < 0.0 && r > x) r -= 1.0;
5709 
5710     return r;
5711 }
5712 
5713 
reference_tanh(double x)5714 double reference_tanh(double x) { return tanh(x); }
5715 
reference_assignmentl(long double x)5716 long double reference_assignmentl(long double x) { return x; }
5717 
reference_notl(long double x)5718 int reference_notl(long double x)
5719 {
5720     int r = !x;
5721     return r;
5722 }
5723