1 /*
2  * e_rem_pio2.c
3  *
4  * Copyright (c) 1999-2018, Arm Limited.
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include <math.h>
9 #include "math_private.h"
10 
__ieee754_rem_pio2(double x,double * y)11 int __ieee754_rem_pio2(double x, double *y) {
12     int q;
13 
14     y[1] = 0.0;                        /* default */
15 
16     /*
17      * Simple cases: all nicked from the fdlibm version for speed.
18      */
19     {
20         static const double invpio2 =  0x1.45f306dc9c883p-1;
21         static const double pio2s[] = {
22             0x1.921fb544p+0, /* 1.57079632673412561417e+00 */
23             0x1.0b4611a626331p-34, /* 6.07710050650619224932e-11 */
24             0x1.0b4611a6p-34, /* 6.07710050630396597660e-11 */
25             0x1.3198a2e037073p-69, /* 2.02226624879595063154e-21 */
26             0x1.3198a2ep-69, /* 2.02226624871116645580e-21 */
27             0x1.b839a252049c1p-104, /* 8.47842766036889956997e-32 */
28         };
29 
30         double z,w,t,r,fn;
31         int i,j,idx,n,ix,hx;
32 
33         hx = __HI(x);           /* high word of x */
34         ix = hx&0x7fffffff;
35         if(ix<=0x3fe921fb)   /* |x| ~<= pi/4 , no need for reduction */
36             {y[0] = x; y[1] = 0; return 0;}
37         if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
38             if(hx>0) {
39                 z = x - pio2s[0];
40                 if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
41                     y[0] = z - pio2s[1];
42 #ifdef bottomhalf
43                     y[1] = (z-y[0])-pio2s[1];
44 #endif
45                 } else {                /* near pi/2, use 33+33+53 bit pi */
46                     z -= pio2s[2];
47                     y[0] = z - pio2s[3];
48 #ifdef bottomhalf
49                     y[1] = (z-y[0])-pio2s[3];
50 #endif
51                 }
52                 return 1;
53             } else {    /* negative x */
54                 z = x + pio2s[0];
55                 if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
56                     y[0] = z + pio2s[1];
57 #ifdef bottomhalf
58                     y[1] = (z-y[0])+pio2s[1];
59 #endif
60                 } else {                /* near pi/2, use 33+33+53 bit pi */
61                     z += pio2s[2];
62                     y[0] = z + pio2s[3];
63 #ifdef bottomhalf
64                     y[1] = (z-y[0])+pio2s[3];
65 #endif
66                 }
67                 return -1;
68             }
69         }
70         if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
71             t  = fabs(x);
72             n  = (int) (t*invpio2+0.5);
73             fn = (double)n;
74             r  = t-fn*pio2s[0];
75             w  = fn*pio2s[1];    /* 1st round good to 85 bit */
76             j  = ix>>20;
77             idx = 1;
78             while (1) {
79                 y[0] = r-w;
80                 if (idx == 3)
81                     break;
82                 i = j-(((__HI(y[0]))>>20)&0x7ff);
83                 if(i <= 33*idx-17)
84                     break;
85                 t  = r;
86                 w  = fn*pio2s[2*idx];
87                 r  = t-w;
88                 w  = fn*pio2s[2*idx+1]-((t-r)-w);
89                 idx++;
90             }
91 #ifdef bottomhalf
92             y[1] = (r-y[0])-w;
93 #endif
94             if(hx<0) {
95                 y[0] = -y[0];
96 #ifdef bottomhalf
97                 y[1] = -y[1];
98 #endif
99                 return -n;
100             } else {
101                 return n;
102             }
103         }
104     }
105 
106     {
107         static const unsigned twooverpi[] = {
108             /* We start with two zero words, because they take up less
109              * space than the array bounds checking and special-case
110              * handling that would have to occur in their absence. */
111             0, 0,
112             /* 2/pi in hex is 0.a2f9836e... */
113             0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599,
114             0x3c439041, 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0,
115             0x06492eea, 0x09d1921c, 0xfe1deb1c, 0xb129a73e, 0xe88235f5,
116             0x2ebb4484, 0xe99c7026, 0xb45f7e41, 0x3991d639, 0x835339f4,
117             0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f, 0xef2f118b,
118             0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
119             0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1,
120             0x1f8d5d08,
121         };
122 
123         /*
124          * Multiprecision multiplication of this nature is more
125          * readily done in integers than in VFP, since we can use
126          * UMULL (on CPUs that support it) to multiply 32 by 32 bits
127          * at a time whereas the VFP would only be able to do 12x12
128          * without losing accuracy.
129          *
130          * So extract the mantissa of the input number as two 32-bit
131          * integers.
132          */
133         unsigned mant1 = 0x00100000 | (__HI(x) & 0xFFFFF);
134         unsigned mant2 = __LO(x);
135         /*
136          * Now work out which part of our stored value of 2/pi we're
137          * supposed to be multiplying by.
138          *
139          * Let the IEEE exponent field of x be e. With its bias
140          * removed, (e-1023) is the index of the set bit in bit 20
141          * of 'mant1' (i.e. that set bit has real place value
142          * 2^(e-1023)). So the lowest set bit in 'mant1', 52 bits
143          * further down, must have place value 2^(e-1075).
144          *
145          * We begin taking an interest in the value of 2/pi at the
146          * bit which multiplies by _that_ to give something with
147          * place value at most 2. In other words, the highest bit of
148          * 2/pi we're interested in is the one with place value
149          * 2/(2^(e-1075)) = 2^(1076-e).
150          *
151          * The bit at the top of the first zero word of the above
152          * array has place value 2^63. Hence, the bit we want to put
153          * at the top of the first word we extract from that array
154          * is the one at bit index n, where 63-n = 1076-e and hence
155          * n=e-1013.
156          */
157         int topbitindex = ((__HI(x) >> 20) & 0x7FF) - 1013;
158         int wordindex = topbitindex >> 5;
159         int shiftup = topbitindex & 31;
160         int shiftdown = 32 - shiftup;
161         unsigned scaled[8];
162         int i;
163 
164         scaled[7] = scaled[6] = 0;
165 
166         for (i = 6; i-- > 0 ;) {
167             /*
168              * Extract a word from our representation of 2/pi.
169              */
170             unsigned word;
171             if (shiftup)
172                 word = (twooverpi[wordindex + i] << shiftup) | (twooverpi[wordindex + i + 1] >> shiftdown);
173             else
174                 word = twooverpi[wordindex + i];
175             /*
176              * Multiply it by both words of our mantissa. (Should
177              * generate UMULLs where available.)
178              */
179             unsigned long long mult1 = (unsigned long long)word * mant1;
180             unsigned long long mult2 = (unsigned long long)word * mant2;
181             /*
182              * Split those up into 32-bit chunks.
183              */
184             unsigned bottom1 = (unsigned)mult1, top1 = (unsigned)(mult1 >> 32);
185             unsigned bottom2 = (unsigned)mult2, top2 = (unsigned)(mult2 >> 32);
186             /*
187              * Add those two chunks together.
188              */
189             unsigned out1, out2, out3;
190 
191             out3 = bottom2;
192             out2 = top2 + bottom1;
193             out1 = top1 + (out2 < top2);
194             /*
195              * And finally add them to our 'scaled' array.
196              */
197             unsigned s3 = scaled[i+2], s2 = scaled[i+1], s1;
198             unsigned carry;
199             s3 = out3 + s3; carry = (s3 < out3);
200             s2 = out2 + s2 + carry; carry = carry ? (s2 <= out2) : (s2 < out2);
201             s1 = out1 + carry;
202 
203             scaled[i+2] = s3;
204             scaled[i+1] = s2;
205             scaled[i] = s1;
206         }
207 
208 
209         /*
210          * The topmost set bit in mant1 is bit 20, and that has
211          * place value 2^(e-1023). The topmost bit (bit 31) of the
212          * most significant word we extracted from our twooverpi
213          * array had place value 2^(1076-e). So the product of those
214          * two bits must have place value 2^53; and that bit will
215          * have ended up as bit 19 of scaled[0]. Hence, the integer
216          * quadrant value we want to output, consisting of the bits
217          * with place values 2^1 and 2^0, must be 52 and 53 bits
218          * below that, i.e. precisely the topmost two bits of
219          * scaled[2].
220          *
221          * Or, at least, it will be once we add 1/2, to round to the
222          * _nearest_ multiple of pi/2 rather than the next one down.
223          */
224         q = (scaled[2] + (1<<29)) >> 30;
225 
226         /*
227          * Now we construct the output fraction, which is most
228          * simply done in the VFP. We just extract four consecutive
229          * bit strings from our chunk of binary data, convert them
230          * to doubles, equip each with an appropriate FP exponent,
231          * add them together, and (don't forget) multiply back up by
232          * pi/2. That way we don't have to work out ourselves where
233          * the highest fraction bit ended up.
234          *
235          * Since our displacement from the nearest multiple of pi/2
236          * can be positive or negative, the topmost of these four
237          * values must be arranged with its 2^-1 bit at the very top
238          * of the word, and then treated as a _signed_ integer.
239          */
240         int i1 = (scaled[2] << 2);
241         unsigned i2 = scaled[3];
242         unsigned i3 = scaled[4];
243         unsigned i4 = scaled[5];
244         double f1 = i1, f2 = i2 * (0x1.0p-30), f3 = i3 * (0x1.0p-62), f4 = i4 * (0x1.0p-94);
245 
246         /*
247          * Now f1+f2+f3+f4 is a representation, potentially to
248          * twice double precision, of 2^32 times ((2/pi)*x minus
249          * some integer). So our remaining job is to multiply
250          * back down by (pi/2)*2^-32, and convert back to one
251          * single-precision output number.
252          */
253         double ftop = f1 + (f2 + (f3 + f4));
254         ftop = __SET_LO(ftop, 0);
255         double fbot = f4 - (((ftop-f1)-f2)-f3);
256 
257         /* ... and multiply by a prec-and-a-half value of (pi/2)*2^-32. */
258         double ret = (ftop * 0x1.921fb54p-32) + (ftop * 0x1.10b4611a62633p-62 + fbot * 0x1.921fb54442d18p-32);
259 
260         /* Just before we return, take the input sign into account. */
261         if (__HI(x) & 0x80000000) {
262             q = -q;
263             ret = -ret;
264         }
265         y[0] = ret;
266         return q;
267     }
268 }
269