1 /*
2  * Single-precision vector powf function.
3  *
4  * Copyright (c) 2019, Arm Limited.
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "mathlib.h"
9 #include "v_math.h"
10 #if V_SUPPORTED
11 
12 #define Min v_u32 (0x00800000)
13 #define Max v_u32 (0x7f800000)
14 #define SBITS 5
15 #define Tlog v__powf_log2_data.tab
16 #define Texp v__exp2f_data.tab
17 #define A v__powf_log2_data.poly
18 #define C v__exp2f_data.poly
19 #define LOGDEG 4
20 
21 #if LOGDEG == 5
22 /* 1.01 ulp */
23 #define OFF v_u32 (0x3f330000)
24 #define TBITS 4
25 #elif LOGDEG == 4
26 /* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */
27 #define OFF v_u32 (0x3f35d000)
28 #define TBITS 5
29 #endif
30 
31 #define V_EXP2F_TABLE_BITS SBITS
32 #define V_EXP2F_POLY_ORDER 3
33 struct v_exp2f_data
34 {
35   uint64_t tab[1 << V_EXP2F_TABLE_BITS];
36   double poly[V_EXP2F_POLY_ORDER];
37 };
38 
39 #define V_POWF_LOG2_TABLE_BITS TBITS
40 #define V_POWF_LOG2_POLY_ORDER LOGDEG
41 #define SCALE ((double) (1 << SBITS))
42 struct v_powf_log2_data
43 {
44   struct
45   {
46     double invc, logc;
47   } tab[1 << V_POWF_LOG2_TABLE_BITS];
48   double poly[V_POWF_LOG2_POLY_ORDER];
49 };
50 
51 static const struct v_powf_log2_data v__powf_log2_data = {
52 #if LOGDEG == 5
53   .tab = {
54 { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE },
55 { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE },
56 { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE },
57 { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE },
58 { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE },
59 { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE },
60 { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE },
61 { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE },
62 { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE },
63 { 0x1p+0, 0x0p+0 * SCALE },
64 { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE },
65 { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE },
66 { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE },
67 { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE },
68 { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE },
69 { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE },
70   },
71 /* rel err: 1.46 * 2^-32 */
72   .poly = {
73 0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE,
74 0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE,
75 0x1.71547652ab82bp0 * SCALE,
76   }
77 #elif LOGDEG == 4
78   .tab = {
79 {0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE},
80 {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE},
81 {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE},
82 {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE},
83 {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE},
84 {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE},
85 {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE},
86 {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE},
87 {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE},
88 {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE},
89 {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE},
90 {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE},
91 {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE},
92 {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE},
93 {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE},
94 {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE},
95 {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE},
96 {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE},
97 {0x1p+0, 0x0p+0 * SCALE},
98 {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE},
99 {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE},
100 {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE},
101 {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE},
102 {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE},
103 {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE},
104 {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE},
105 {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE},
106 {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE},
107 {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE},
108 {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE},
109 {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE},
110 {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE},
111   },
112 /* rel err: 1.5 * 2^-30 */
113   .poly = {
114  -0x1.6ff5daa3b3d7cp-2 * SCALE,
115  0x1.ec81d03c01aebp-2 * SCALE,
116  -0x1.71547bb43f101p-1 * SCALE,
117  0x1.7154764a815cbp0 * SCALE,
118   }
119 #endif
120 };
121 
122 static const struct v_exp2f_data v__exp2f_data = {
123   .tab = {
124 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
125 0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
126 0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
127 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
128 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
129 0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
130 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
131 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
132   },
133 /* rel err: 1.69 * 2^-34 */
134   .poly = {
135 0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE
136   },
137 };
138 
139 VPCS_ATTR
140 __attribute__ ((noinline)) static v_f32_t
141 specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp)
142 {
143   return v_call2_f32 (powf, x, y, ret, cmp);
144 }
145 
146 VPCS_ATTR
147 v_f32_t
148 V_NAME(powf) (v_f32_t x, v_f32_t y)
149 {
150   v_u32_t u, tmp, cmp, i, top, iz;
151   v_s32_t k;
152   v_f32_t ret;
153 
154   u = v_as_u32_f32 (x);
155   cmp = v_cond_u32 (u - Min >= Max - Min);
156   tmp = u - OFF;
157   i = (tmp >> (23 - TBITS)) % (1 << TBITS);
158   top = tmp & 0xff800000;
159   iz = u - top;
160   k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */
161 
162   for (int lane = 0; lane < v_lanes32 (); lane++)
163     {
164       uint32_t si, siz;
165       int32_t sk;
166       float sy;
167 
168       /* Use double precision for each lane.  */
169       double invc, logc, z, r, p, y0, logx, ylogx, kd, s;
170       uint64_t ki, t;
171 
172       si = v_get_u32 (i, lane);
173       siz = v_get_u32 (iz, lane);
174       sk = v_get_s32 (k, lane);
175       sy = v_get_f32 (y, lane);
176 
177       invc = Tlog[si].invc;
178       logc = Tlog[si].logc;
179       z = (double) as_f32_u32 (siz);
180 
181       /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
182       r = __builtin_fma (z, invc, -1.0);
183       y0 = logc + (double) sk;
184 
185       /* Polynomial to approximate log1p(r)/ln2.  */
186 #if LOGDEG == 5
187       logx = A[0];
188       logx = r * logx + A[1];
189       logx = r * logx + A[2];
190       logx = r * logx + A[3];
191       logx = r * logx + A[4];
192       logx = r * logx + y0;
193 #elif LOGDEG == 4
194       logx = A[0];
195       logx = r * logx + A[1];
196       logx = r * logx + A[2];
197       logx = r * logx + A[3];
198       logx = r * logx + y0;
199 #endif
200       ylogx = sy * logx;
201       v_set_u32 (&cmp, lane,
202 		 (as_u64_f64 (ylogx) >> 47 & 0xffff)
203 		     >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47
204 		   ? 1
205 		   : v_get_u32 (cmp, lane));
206 
207       /* N*x = k + r with r in [-1/2, 1/2] */
208 #if TOINT_INTRINSICS
209       kd = roundtoint (ylogx); /* k */
210       ki = converttoint (ylogx);
211 #else
212 # define SHIFT 0x1.8p52
213       kd = eval_as_double (ylogx + SHIFT);
214       ki = asuint64 (kd);
215       kd -= SHIFT;
216 #endif
217       r = ylogx - kd;
218 
219       /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
220       t = Texp[ki % (1 << SBITS)];
221       t += ki << (52 - SBITS);
222       s = as_f64_u64 (t);
223       p = C[0];
224       p = __builtin_fma (p, r, C[1]);
225       p = __builtin_fma (p, r, C[2]);
226       p = __builtin_fma (p, s * r, s);
227 
228       v_set_f32 (&ret, lane, p);
229     }
230   if (unlikely (v_any_u32 (cmp)))
231     return specialcase (x, y, ret, cmp);
232   return ret;
233 }
234 VPCS_ALIAS
235 #endif
236