1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                              host_generic_maddf.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    Compute x * y + z as ternary operation.
8    Copyright (C) 2010-2013 Free Software Foundation, Inc.
9    This file is part of the GNU C Library.
10    Contributed by Jakub Jelinek <jakub@redhat.com>, 2010.
11 
12    The GNU C Library is free software; you can redistribute it and/or
13    modify it under the terms of the GNU Lesser General Public
14    License as published by the Free Software Foundation; either
15    version 2.1 of the License, or (at your option) any later version.
16 
17    The GNU C Library is distributed in the hope that it will be useful,
18    but WITHOUT ANY WARRANTY; without even the implied warranty of
19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20    Lesser General Public License for more details.
21 
22    You should have received a copy of the GNU Lesser General Public
23    License along with the GNU C Library; if not, see
24    <http://www.gnu.org/licenses/>.
25 */
26 
27 /* Generic helper functions for doing FMA, i.e. compute x * y + z
28    as ternary operation.
29    These are purely back-end entities and cannot be seen/referenced
30    from IR. */
31 
32 #include "libvex_basictypes.h"
33 #include "host_generic_maddf.h"
34 #include "main_util.h"
35 
36 /* This implementation relies on Double being more than twice as
37    precise as Float and uses rounding to odd in order to avoid problems
38    with double rounding.
39    See a paper by Boldo and Melquiond:
40    http://www.lri.fr/~melquion/doc/08-tc.pdf  */
41 
42 #define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))
43 
44 #if defined(__x86_64__) && defined(__SSE2_MATH__)
45 # define ENV_TYPE unsigned int
46 /* Save current rounding mode into ENV, hold exceptions, set rounding
47    mode to rounding toward zero.  */
48 # define ROUNDTOZERO(env) \
49    do {							\
50       unsigned int mxcsr;				\
51       __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
52       (env) = mxcsr;					\
53       mxcsr = (mxcsr | 0x7f80) & ~0x3f;			\
54       __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
55    } while (0)
56 /* Restore exceptions from ENV, return if inexact exception has been raised
57    since ROUNDTOZERO.  */
58 # define RESET_TESTINEXACT(env) \
59    ({							\
60       unsigned int mxcsr, ret;				\
61       __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
62       ret = (mxcsr >> 5) & 1;				\
63       mxcsr = (mxcsr & 0x3d) | (env);			\
64       __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
65       ret;						\
66    })
67 /* Return if inexact exception has been raised since ROUNDTOZERO.  */
68 # define TESTINEXACT() \
69    ({							\
70       unsigned int mxcsr;				\
71       __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
72       (mxcsr >> 5) & 1;					\
73    })
74 #endif
75 
76 #define DBL_MANT_DIG 53
77 #define IEEE754_DOUBLE_BIAS 0x3ff
78 
79 union vg_ieee754_double {
80    Double d;
81 
82    /* This is the IEEE 754 double-precision format.  */
83    struct {
84 #ifdef VKI_BIG_ENDIAN
85       unsigned int negative:1;
86       unsigned int exponent:11;
87       unsigned int mantissa0:20;
88       unsigned int mantissa1:32;
89 #else
90       unsigned int mantissa1:32;
91       unsigned int mantissa0:20;
92       unsigned int exponent:11;
93       unsigned int negative:1;
94 #endif
95    } ieee;
96 };
97 
98 void VEX_REGPARM(3)
h_generic_calc_MAddF32(Float * res,Float * argX,Float * argY,Float * argZ)99      h_generic_calc_MAddF32 ( /*OUT*/Float* res,
100                                Float* argX, Float* argY, Float* argZ )
101 {
102 #ifndef ENV_TYPE
103    /* Lame fallback implementation.  */
104    *res = *argX * *argY + *argZ;
105 #else
106    ENV_TYPE env;
107    /* Multiplication is always exact.  */
108    Double temp = (Double) *argX * (Double) *argY;
109    union vg_ieee754_double u;
110 
111    ROUNDTOZERO (env);
112 
113    /* Perform addition with round to odd.  */
114    u.d = temp + (Double) *argZ;
115    /* Ensure the addition is not scheduled after fetestexcept call.  */
116    FORCE_EVAL (u.d);
117 
118    /* Reset rounding mode and test for inexact simultaneously.  */
119    int j = RESET_TESTINEXACT (env);
120 
121    if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
122       u.ieee.mantissa1 |= j;
123 
124    /* And finally truncation with round to nearest.  */
125    *res = (Float) u.d;
126 #endif
127 }
128 
129 
130 void VEX_REGPARM(3)
h_generic_calc_MAddF64(Double * res,Double * argX,Double * argY,Double * argZ)131      h_generic_calc_MAddF64 ( /*OUT*/Double* res,
132                                Double* argX, Double* argY, Double* argZ )
133 {
134 #ifndef ENV_TYPE
135    /* Lame fallback implementation.  */
136    *res = *argX * *argY + *argZ;
137 #else
138    Double x = *argX, y = *argY, z = *argZ;
139    union vg_ieee754_double u, v, w;
140    int adjust = 0;
141    u.d = x;
142    v.d = y;
143    w.d = z;
144    if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
145                  >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
146        || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
147        || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
148        || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
149        || UNLIKELY (u.ieee.exponent + v.ieee.exponent
150                     <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
151       /* If z is Inf, but x and y are finite, the result should be
152          z rather than NaN.  */
153       if (w.ieee.exponent == 0x7ff
154           && u.ieee.exponent != 0x7ff
155           && v.ieee.exponent != 0x7ff) {
156          *res = (z + x) + y;
157          return;
158       }
159       /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
160          or if x * y is less than half of DBL_DENORM_MIN,
161          compute as x * y + z.  */
162       if (u.ieee.exponent == 0x7ff
163           || v.ieee.exponent == 0x7ff
164           || w.ieee.exponent == 0x7ff
165           || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
166           || u.ieee.exponent + v.ieee.exponent
167              < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
168          *res = x * y + z;
169          return;
170       }
171       if (u.ieee.exponent + v.ieee.exponent
172           >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
173          /* Compute 1p-53 times smaller result and multiply
174             at the end.  */
175          if (u.ieee.exponent > v.ieee.exponent)
176             u.ieee.exponent -= DBL_MANT_DIG;
177          else
178             v.ieee.exponent -= DBL_MANT_DIG;
179          /* If x + y exponent is very large and z exponent is very small,
180             it doesn't matter if we don't adjust it.  */
181          if (w.ieee.exponent > DBL_MANT_DIG)
182             w.ieee.exponent -= DBL_MANT_DIG;
183          adjust = 1;
184       } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
185          /* Similarly.
186             If z exponent is very large and x and y exponents are
187             very small, it doesn't matter if we don't adjust it.  */
188          if (u.ieee.exponent > v.ieee.exponent) {
189             if (u.ieee.exponent > DBL_MANT_DIG)
190                u.ieee.exponent -= DBL_MANT_DIG;
191          } else if (v.ieee.exponent > DBL_MANT_DIG)
192             v.ieee.exponent -= DBL_MANT_DIG;
193          w.ieee.exponent -= DBL_MANT_DIG;
194          adjust = 1;
195       } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
196          u.ieee.exponent -= DBL_MANT_DIG;
197          if (v.ieee.exponent)
198             v.ieee.exponent += DBL_MANT_DIG;
199          else
200             v.d *= 0x1p53;
201       } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
202          v.ieee.exponent -= DBL_MANT_DIG;
203          if (u.ieee.exponent)
204             u.ieee.exponent += DBL_MANT_DIG;
205          else
206             u.d *= 0x1p53;
207       } else /* if (u.ieee.exponent + v.ieee.exponent
208                     <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
209          if (u.ieee.exponent > v.ieee.exponent)
210             u.ieee.exponent += 2 * DBL_MANT_DIG;
211          else
212             v.ieee.exponent += 2 * DBL_MANT_DIG;
213          if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
214             if (w.ieee.exponent)
215                w.ieee.exponent += 2 * DBL_MANT_DIG;
216             else
217                w.d *= 0x1p106;
218             adjust = -1;
219          }
220          /* Otherwise x * y should just affect inexact
221             and nothing else.  */
222       }
223       x = u.d;
224       y = v.d;
225       z = w.d;
226    }
227    /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
228 #  define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
229    Double x1 = x * C;
230    Double y1 = y * C;
231    Double m1 = x * y;
232    x1 = (x - x1) + x1;
233    y1 = (y - y1) + y1;
234    Double x2 = x - x1;
235    Double y2 = y - y1;
236    Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
237 #  undef C
238 
239    /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
240    Double a1 = z + m1;
241    Double t1 = a1 - z;
242    Double t2 = a1 - t1;
243    t1 = m1 - t1;
244    t2 = z - t2;
245    Double a2 = t1 + t2;
246 
247    ENV_TYPE env;
248    ROUNDTOZERO (env);
249 
250    /* Perform m2 + a2 addition with round to odd.  */
251    u.d = a2 + m2;
252 
253    if (UNLIKELY (adjust < 0)) {
254       if ((u.ieee.mantissa1 & 1) == 0)
255          u.ieee.mantissa1 |= TESTINEXACT ();
256       v.d = a1 + u.d;
257       /* Ensure the addition is not scheduled after fetestexcept call.  */
258       FORCE_EVAL (v.d);
259    }
260 
261    /* Reset rounding mode and test for inexact simultaneously.  */
262    int j = RESET_TESTINEXACT (env) != 0;
263 
264    if (LIKELY (adjust == 0)) {
265       if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
266          u.ieee.mantissa1 |= j;
267       /* Result is a1 + u.d.  */
268       *res = a1 + u.d;
269    } else if (LIKELY (adjust > 0)) {
270       if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
271          u.ieee.mantissa1 |= j;
272       /* Result is a1 + u.d, scaled up.  */
273       *res = (a1 + u.d) * 0x1p53;
274    } else {
275       /* If a1 + u.d is exact, the only rounding happens during
276          scaling down.  */
277       if (j == 0) {
278          *res = v.d * 0x1p-106;
279          return;
280       }
281       /* If result rounded to zero is not subnormal, no double
282          rounding will occur.  */
283       if (v.ieee.exponent > 106) {
284          *res = (a1 + u.d) * 0x1p-106;
285          return;
286       }
287       /* If v.d * 0x1p-106 with round to zero is a subnormal above
288          or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
289          down just by 1 bit, which means v.ieee.mantissa1 |= j would
290          change the round bit, not sticky or guard bit.
291          v.d * 0x1p-106 never normalizes by shifting up,
292          so round bit plus sticky bit should be already enough
293          for proper rounding.  */
294       if (v.ieee.exponent == 106) {
295          /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
296             v.ieee.mantissa1 & 1 is the round bit and j is our sticky
297             bit.  In round-to-nearest 001 rounds down like 00,
298             011 rounds up, even though 01 rounds down (thus we need
299             to adjust), 101 rounds down like 10 and 111 rounds up
300             like 11.  */
301          if ((v.ieee.mantissa1 & 3) == 1) {
302             v.d *= 0x1p-106;
303             if (v.ieee.negative)
304                *res = v.d - 0x1p-1074;
305             else
306                *res = v.d + 0x1p-1074;
307          } else
308             *res = v.d * 0x1p-106;
309          return;
310       }
311       v.ieee.mantissa1 |= j;
312       *res = v.d * 0x1p-106;
313       return;
314     }
315 #endif
316 }
317 
318 /*---------------------------------------------------------------*/
319 /*--- end                                 host_generic_maddf.c --*/
320 /*---------------------------------------------------------------*/
321