/*---------------------------------------------------------------*/
/*--- begin                               host_generic_maddf.c ---*/
/*---------------------------------------------------------------*/

/*
   Compute x * y + z as ternary operation.
   Copyright (C) 2010-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Jakub Jelinek <jakub@redhat.com>, 2010.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.
*/

/* Generic helper functions for doing FMA, i.e. compute x * y + z
   as ternary operation.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_maddf.h"
#include "main_util.h"

/* This implementation relies on Double being more than twice as
   precise as Float and uses rounding to odd in order to avoid problems
   with double rounding.
   See a paper by Boldo and Melquiond:
   http://www.lri.fr/~melquion/doc/08-tc.pdf  */

#define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))
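/* Note on the MXCSR manipulation in the macros below (x86_64/SSE2
   only): bits 0..5 of MXCSR are the sticky exception flags (bit 5, PE,
   is the inexact flag read by TESTINEXACT/RESET_TESTINEXACT), bits
   7..12 are the exception masks, and bits 13..14 select the rounding
   mode (11b = round toward zero).  So "(mxcsr | 0x7f80) & ~0x3f" in
   ROUNDTOZERO masks all exceptions, clears the sticky flags and
   selects round-toward-zero in one step, while the value saved in ENV
   is used later to restore the original control state.  */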
#if defined(__x86_64__) && defined(__SSE2_MATH__)
# define ENV_TYPE unsigned int
/* Save current rounding mode into ENV, hold exceptions, set rounding
   mode to rounding toward zero.  */
# define ROUNDTOZERO(env) \
   do {                                                  \
      unsigned int mxcsr;                                \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));  \
      (env) = mxcsr;                                     \
      mxcsr = (mxcsr | 0x7f80) & ~0x3f;                  \
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr)); \
   } while (0)
/* Restore exceptions from ENV, return if inexact exception has been
   raised since ROUNDTOZERO.  */
# define RESET_TESTINEXACT(env) \
   ({                                                    \
      unsigned int mxcsr, ret;                           \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));  \
      ret = (mxcsr >> 5) & 1;                            \
      mxcsr = (mxcsr & 0x3d) | (env);                    \
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr)); \
      ret;                                               \
   })
/* Return if inexact exception has been raised since ROUNDTOZERO.  */
# define TESTINEXACT() \
   ({                                                    \
      unsigned int mxcsr;                                \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));  \
      (mxcsr >> 5) & 1;                                  \
   })
#endif

#define DBL_MANT_DIG 53
#define IEEE754_DOUBLE_BIAS 0x3ff

union vg_ieee754_double {
   Double d;

   /* This is the IEEE 754 double-precision format.  */
   struct {
#ifdef VKI_BIG_ENDIAN
      unsigned int negative:1;
      unsigned int exponent:11;
      unsigned int mantissa0:20;
      unsigned int mantissa1:32;
#else
      unsigned int mantissa1:32;
      unsigned int mantissa0:20;
      unsigned int exponent:11;
      unsigned int negative:1;
#endif
   } ieee;
};

void VEX_REGPARM(3)
     h_generic_calc_MAddF32 ( /*OUT*/Float* res,
                              Float* argX, Float* argY, Float* argZ )
{
#ifndef ENV_TYPE
   /* Lame fallback implementation.  */
   *res = *argX * *argY + *argZ;
#else
   ENV_TYPE env;
   /* Multiplication is always exact.  */
   Double temp = (Double) *argX * (Double) *argY;
   union vg_ieee754_double u;

   ROUNDTOZERO (env);

   /* Perform addition with round to odd.  */
   u.d = temp + (Double) *argZ;
   /* Ensure the addition is not scheduled after the inexact test below.  */
   FORCE_EVAL (u.d);

   /* Reset rounding mode and test for inexact simultaneously.  */
   int j = RESET_TESTINEXACT (env);

   if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
      u.ieee.mantissa1 |= j;

   /* And finally truncation with round to nearest.  */
   *res = (Float) u.d;
#endif
}
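/* Why rounding to odd is enough for MAddF32, in outline: the
   Float*Float product is exact in Double (24+24 significand bits fit
   in 53), so the only inexact step is the Double addition above.  That
   addition is done with round-toward-zero and, if it was inexact, the
   lowest mantissa bit is forced to 1 ("rounding to odd").  An odd
   Double can never fall exactly on a rounding boundary of the Float
   format, so the final Double -> Float conversion, done in the
   caller's round-to-nearest mode, yields the same Float as rounding
   the exact value of x * y + z directly.  See the Boldo/Melquiond
   paper cited above for the precise statement and proof.  */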

void VEX_REGPARM(3)
     h_generic_calc_MAddF64 ( /*OUT*/Double* res,
                              Double* argX, Double* argY, Double* argZ )
{
#ifndef ENV_TYPE
   /* Lame fallback implementation.  */
   *res = *argX * *argY + *argZ;
#else
   Double x = *argX, y = *argY, z = *argZ;
   union vg_ieee754_double u, v, w;
   int adjust = 0;
   u.d = x;
   v.d = y;
   w.d = z;
   if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
                 >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
       || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (u.ieee.exponent + v.ieee.exponent
                    <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
      /* If z is Inf, but x and y are finite, the result should be
         z rather than NaN.  */
      if (w.ieee.exponent == 0x7ff
          && u.ieee.exponent != 0x7ff
          && v.ieee.exponent != 0x7ff) {
         *res = (z + x) + y;
         return;
      }
      /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
         or if x * y is less than half of DBL_DENORM_MIN,
         compute as x * y + z.  */
      if (u.ieee.exponent == 0x7ff
          || v.ieee.exponent == 0x7ff
          || w.ieee.exponent == 0x7ff
          || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
          || u.ieee.exponent + v.ieee.exponent
             < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
         *res = x * y + z;
         return;
      }
      if (u.ieee.exponent + v.ieee.exponent
          >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
         /* Compute 1p-53 times smaller result and multiply
            at the end.  */
         if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent -= DBL_MANT_DIG;
         else
            v.ieee.exponent -= DBL_MANT_DIG;
         /* If the x * y exponent is very large and the z exponent is
            very small, it doesn't matter if we don't adjust it.  */
         if (w.ieee.exponent > DBL_MANT_DIG)
            w.ieee.exponent -= DBL_MANT_DIG;
         adjust = 1;
      } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         /* Similarly.
            If the z exponent is very large and the x and y exponents
            are very small, it doesn't matter if we don't adjust it.  */
         if (u.ieee.exponent > v.ieee.exponent) {
            if (u.ieee.exponent > DBL_MANT_DIG)
               u.ieee.exponent -= DBL_MANT_DIG;
         } else if (v.ieee.exponent > DBL_MANT_DIG)
            v.ieee.exponent -= DBL_MANT_DIG;
         w.ieee.exponent -= DBL_MANT_DIG;
         adjust = 1;
      } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         u.ieee.exponent -= DBL_MANT_DIG;
         if (v.ieee.exponent)
            v.ieee.exponent += DBL_MANT_DIG;
         else
            v.d *= 0x1p53;
      } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         v.ieee.exponent -= DBL_MANT_DIG;
         if (u.ieee.exponent)
            u.ieee.exponent += DBL_MANT_DIG;
         else
            u.d *= 0x1p53;
      } else /* if (u.ieee.exponent + v.ieee.exponent
                    <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
         if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent += 2 * DBL_MANT_DIG;
         else
            v.ieee.exponent += 2 * DBL_MANT_DIG;
         if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
            if (w.ieee.exponent)
               w.ieee.exponent += 2 * DBL_MANT_DIG;
            else
               w.d *= 0x1p106;
            adjust = -1;
         }
         /* Otherwise x * y should just affect inexact
            and nothing else.  */
      }
      x = u.d;
      y = v.d;
      z = w.d;
   }
   /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
# define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
   Double x1 = x * C;
   Double y1 = y * C;
   Double m1 = x * y;
   x1 = (x - x1) + x1;
   y1 = (y - y1) + y1;
   Double x2 = x - x1;
   Double y2 = y - y1;
   Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
# undef C

   /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
   Double a1 = z + m1;
   Double t1 = a1 - z;
   Double t2 = a1 - t1;
   t1 = m1 - t1;
   t2 = z - t2;
   Double a2 = t1 + t2;
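   /* What the two blocks above achieve (error-free transformations):
      Dekker's splitting constant C = 2^27 + 1 breaks each operand into
      a high and a low part small enough that all four partial products
      are exact, so m1 + m2 == x * y exactly; Knuth's two-sum likewise
      gives a1 + a2 == z + m1 exactly, whatever the relative magnitudes
      of z and m1.  Hence x * y + z == a1 + (a2 + m2) exactly, and the
      exponent adjustments made earlier are there to keep these
      intermediate results away from overflow and from the subnormal
      range.  */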

   ENV_TYPE env;
   ROUNDTOZERO (env);

   /* Perform m2 + a2 addition with round to odd.  */
   u.d = a2 + m2;

   if (UNLIKELY (adjust < 0)) {
      if ((u.ieee.mantissa1 & 1) == 0)
         u.ieee.mantissa1 |= TESTINEXACT ();
      v.d = a1 + u.d;
      /* Ensure the addition is not scheduled after the inexact test below.  */
      FORCE_EVAL (v.d);
   }

   /* Reset rounding mode and test for inexact simultaneously.  */
   int j = RESET_TESTINEXACT (env) != 0;

   if (LIKELY (adjust == 0)) {
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
         u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d.  */
      *res = a1 + u.d;
   } else if (LIKELY (adjust > 0)) {
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
         u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d, scaled up.  */
      *res = (a1 + u.d) * 0x1p53;
   } else {
      /* If a1 + u.d is exact, the only rounding happens during
         scaling down.  */
      if (j == 0) {
         *res = v.d * 0x1p-106;
         return;
      }
      /* If result rounded to zero is not subnormal, no double
         rounding will occur.  */
      if (v.ieee.exponent > 106) {
         *res = (a1 + u.d) * 0x1p-106;
         return;
      }
      /* If v.d * 0x1p-106 with round to zero is a subnormal above
         or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
         down just by 1 bit, which means v.ieee.mantissa1 |= j would
         change the round bit, not the sticky or guard bit.
         v.d * 0x1p-106 never normalizes by shifting up,
         so round bit plus sticky bit should be already enough
         for proper rounding.  */
      if (v.ieee.exponent == 106) {
         /* v.ieee.mantissa1 & 2 is the LSB of the result before
            rounding, v.ieee.mantissa1 & 1 is the round bit and j is
            our sticky bit.  In round-to-nearest 001 rounds down like
            00, 011 rounds up, even though 01 rounds down (thus we need
            to adjust), 101 rounds down like 10 and 111 rounds up
            like 11.  */
         if ((v.ieee.mantissa1 & 3) == 1) {
            v.d *= 0x1p-106;
            if (v.ieee.negative)
               *res = v.d - 0x1p-1074;
            else
               *res = v.d + 0x1p-1074;
         } else
            *res = v.d * 0x1p-106;
         return;
      }
      v.ieee.mantissa1 |= j;
      *res = v.d * 0x1p-106;
      return;
   }
#endif
}

/*---------------------------------------------------------------*/
/*--- end                                 host_generic_maddf.c ---*/
/*---------------------------------------------------------------*/