1 /* 2 * Single-precision vector powf function. 3 * 4 * Copyright (c) 2019, Arm Limited. 5 * SPDX-License-Identifier: MIT 6 */ 7 8 #include "mathlib.h" 9 #include "v_math.h" 10 #if V_SUPPORTED 11 12 #define Min v_u32 (0x00800000) 13 #define Max v_u32 (0x7f800000) 14 #define SBITS 5 15 #define Tlog v__powf_log2_data.tab 16 #define Texp v__exp2f_data.tab 17 #define A v__powf_log2_data.poly 18 #define C v__exp2f_data.poly 19 #define LOGDEG 4 20 21 #if LOGDEG == 5 22 /* 1.01 ulp */ 23 #define OFF v_u32 (0x3f330000) 24 #define TBITS 4 25 #elif LOGDEG == 4 26 /* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */ 27 #define OFF v_u32 (0x3f35d000) 28 #define TBITS 5 29 #endif 30 31 #define V_EXP2F_TABLE_BITS SBITS 32 #define V_EXP2F_POLY_ORDER 3 33 struct v_exp2f_data 34 { 35 uint64_t tab[1 << V_EXP2F_TABLE_BITS]; 36 double poly[V_EXP2F_POLY_ORDER]; 37 }; 38 39 #define V_POWF_LOG2_TABLE_BITS TBITS 40 #define V_POWF_LOG2_POLY_ORDER LOGDEG 41 #define SCALE ((double) (1 << SBITS)) 42 struct v_powf_log2_data 43 { 44 struct 45 { 46 double invc, logc; 47 } tab[1 << V_POWF_LOG2_TABLE_BITS]; 48 double poly[V_POWF_LOG2_POLY_ORDER]; 49 }; 50 51 static const struct v_powf_log2_data v__powf_log2_data = { 52 #if LOGDEG == 5 53 .tab = { 54 { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE }, 55 { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE }, 56 { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE }, 57 { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE }, 58 { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE }, 59 { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE }, 60 { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE }, 61 { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE }, 62 { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE }, 63 { 0x1p+0, 0x0p+0 * SCALE }, 64 { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE }, 65 { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE }, 66 { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE }, 67 { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE }, 68 { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE }, 69 { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE }, 70 }, 71 /* rel err: 1.46 * 2^-32 */ 72 .poly = { 73 0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE, 74 0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE, 75 0x1.71547652ab82bp0 * SCALE, 76 } 77 #elif LOGDEG == 4 78 .tab = { 79 {0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE}, 80 {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE}, 81 {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE}, 82 {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE}, 83 {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE}, 84 {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE}, 85 {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE}, 86 {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE}, 87 {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE}, 88 {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE}, 89 {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE}, 90 {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE}, 91 {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE}, 92 {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE}, 93 {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE}, 94 {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE}, 95 {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE}, 96 {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE}, 97 {0x1p+0, 0x0p+0 * SCALE}, 98 {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE}, 99 {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE}, 100 {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE}, 101 {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE}, 102 {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE}, 103 {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE}, 104 {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE}, 105 {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE}, 106 {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE}, 107 {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE}, 108 {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE}, 109 {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE}, 110 {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE}, 111 }, 112 /* rel err: 1.5 * 2^-30 */ 113 .poly = { 114 -0x1.6ff5daa3b3d7cp-2 * SCALE, 115 0x1.ec81d03c01aebp-2 * SCALE, 116 -0x1.71547bb43f101p-1 * SCALE, 117 0x1.7154764a815cbp0 * SCALE, 118 } 119 #endif 120 }; 121 122 static const struct v_exp2f_data v__exp2f_data = { 123 .tab = { 124 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, 125 0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 126 0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, 127 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, 128 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, 129 0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, 130 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, 131 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, 132 }, 133 /* rel err: 1.69 * 2^-34 */ 134 .poly = { 135 0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE 136 }, 137 }; 138 139 VPCS_ATTR 140 __attribute__ ((noinline)) static v_f32_t 141 specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp) 142 { 143 return v_call2_f32 (powf, x, y, ret, cmp); 144 } 145 146 VPCS_ATTR 147 v_f32_t 148 V_NAME(powf) (v_f32_t x, v_f32_t y) 149 { 150 v_u32_t u, tmp, cmp, i, top, iz; 151 v_s32_t k; 152 v_f32_t ret; 153 154 u = v_as_u32_f32 (x); 155 cmp = v_cond_u32 (u - Min >= Max - Min); 156 tmp = u - OFF; 157 i = (tmp >> (23 - TBITS)) % (1 << TBITS); 158 top = tmp & 0xff800000; 159 iz = u - top; 160 k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */ 161 162 for (int lane = 0; lane < v_lanes32 (); lane++) 163 { 164 uint32_t si, siz; 165 int32_t sk; 166 float sy; 167 168 /* Use double precision for each lane. */ 169 double invc, logc, z, r, p, y0, logx, ylogx, kd, s; 170 uint64_t ki, t; 171 172 si = v_get_u32 (i, lane); 173 siz = v_get_u32 (iz, lane); 174 sk = v_get_s32 (k, lane); 175 sy = v_get_f32 (y, lane); 176 177 invc = Tlog[si].invc; 178 logc = Tlog[si].logc; 179 z = (double) as_f32_u32 (siz); 180 181 /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ 182 r = __builtin_fma (z, invc, -1.0); 183 y0 = logc + (double) sk; 184 185 /* Polynomial to approximate log1p(r)/ln2. */ 186 #if LOGDEG == 5 187 logx = A[0]; 188 logx = r * logx + A[1]; 189 logx = r * logx + A[2]; 190 logx = r * logx + A[3]; 191 logx = r * logx + A[4]; 192 logx = r * logx + y0; 193 #elif LOGDEG == 4 194 logx = A[0]; 195 logx = r * logx + A[1]; 196 logx = r * logx + A[2]; 197 logx = r * logx + A[3]; 198 logx = r * logx + y0; 199 #endif 200 ylogx = sy * logx; 201 v_set_u32 (&cmp, lane, 202 (as_u64_f64 (ylogx) >> 47 & 0xffff) 203 >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47 204 ? 1 205 : v_get_u32 (cmp, lane)); 206 207 /* N*x = k + r with r in [-1/2, 1/2] */ 208 #if TOINT_INTRINSICS 209 kd = roundtoint (ylogx); /* k */ 210 ki = converttoint (ylogx); 211 #else 212 # define SHIFT 0x1.8p52 213 kd = eval_as_double (ylogx + SHIFT); 214 ki = asuint64 (kd); 215 kd -= SHIFT; 216 #endif 217 r = ylogx - kd; 218 219 /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ 220 t = Texp[ki % (1 << SBITS)]; 221 t += ki << (52 - SBITS); 222 s = as_f64_u64 (t); 223 p = C[0]; 224 p = __builtin_fma (p, r, C[1]); 225 p = __builtin_fma (p, r, C[2]); 226 p = __builtin_fma (p, s * r, s); 227 228 v_set_f32 (&ret, lane, p); 229 } 230 if (unlikely (v_any_u32 (cmp))) 231 return specialcase (x, y, ret, cmp); 232 return ret; 233 } 234 VPCS_ALIAS 235 #endif 236