1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-vscaleextexp/avx512f-p5-scalef.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <immintrin.h>
13 
14 #include <xnnpack/common.h>
15 #include <xnnpack/intrinsics-polyfill.h>
16 #include <xnnpack/vscaleextexp.h>
17 
18 
xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x32(size_t elements,const float * x,float * y,float scale_value,float scale_exp)19 void xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x32(
20     size_t elements,
21     const float* x,
22     float* y,
23     float scale_value,
24     float scale_exp)
25 {
26   assert(elements % sizeof(float) == 0);
27 
28   const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f);
29   const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E43p-1f);
30   const __m512 vminus_ln2_lo = _mm512_set1_ps(0x1.05C61p-29f);
31 
32   const __m512 vc0 = _mm512_set1_ps(1.0f);
33   const __m512 vc1 = _mm512_set1_ps(0x1.FFFFF6p-1f);
34   const __m512 vc2 = _mm512_set1_ps(0x1.FFFDC6p-2f);
35   const __m512 vc3 = _mm512_set1_ps(0x1.555A80p-3f);
36   const __m512 vc4 = _mm512_set1_ps(0x1.573A1Ap-5f);
37   const __m512 vc5 = _mm512_set1_ps(0x1.0F9F9Cp-7f);
38 
39   const __m512 vscalev = _mm512_set1_ps(scale_value);
40   const __m512 vscalee = _mm512_set1_ps(scale_exp);
41 
42   for (; elements >= 32 * sizeof(float); elements -= 32 * sizeof(float)) {
43     // Load 32 (2x16) inputs at a time.
44     const __m512 vx0 = _mm512_loadu_ps(x);
45     const __m512 vx1 = _mm512_loadu_ps(x + 16);
46     x += 32;
47 
48     // Compute reduced argument elements := round(x / log(2)).
49     const __m512 vn0 = _mm512_roundscale_ps(_mm512_mul_ps(vx0, vlog2e), 0);
50     const __m512 vn1 = _mm512_roundscale_ps(_mm512_mul_ps(vx1, vlog2e), 0);
51 
52     // Compute reduced argument t := x - elements * log(2).
53     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
54     __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_hi, vx0);
55     __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2_hi, vx1);
56 
57     vt0 = _mm512_fmadd_ps(vn0, vminus_ln2_lo, vt0);
58     vt1 = _mm512_fmadd_ps(vn1, vminus_ln2_lo, vt1);
59 
60     // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
61     __m512 vp0 = _mm512_fmadd_ps(vc5, vt0, vc4);
62     __m512 vp1 = _mm512_fmadd_ps(vc5, vt1, vc4);
63 
64     vp0 = _mm512_fmadd_ps(vp0, vt0, vc3);
65     vp1 = _mm512_fmadd_ps(vp1, vt1, vc3);
66 
67     vp0 = _mm512_fmadd_ps(vp0, vt0, vc2);
68     vp1 = _mm512_fmadd_ps(vp1, vt1, vc2);
69 
70     vp0 = _mm512_fmadd_ps(vp0, vt0, vc1);
71     vp1 = _mm512_fmadd_ps(vp1, vt1, vc1);
72 
73     vp0 = _mm512_fmadd_ps(vp0, vt0, vc0);
74     vp1 = _mm512_fmadd_ps(vp1, vt1, vc0);
75 
76     // Multiply "extended" floating-point numbers in ("mantissa", "exponent") representation where
77     //  - vnX is "exponent"
78     //  - vpX is "mantissa"
79     //
80     // exp2(ae) * av * exp2(be) * bv =
81     //   = exp2(ae + be) * (av * bv)
82     __m512 vf0 = _mm512_mul_ps(vp0, vscalev);
83     __m512 vf1 = _mm512_mul_ps(vp1, vscalev);
84 
85     const __m512 ve0 = _mm512_add_ps(vn0, vscalee);
86     const __m512 ve1 = _mm512_add_ps(vn1, vscalee);
87 
88     // Multiply "mantissa" by the exp2("exponent").
89     vf0 = _mm512_scalef_ps(vf0, ve0);
90     vf1 = _mm512_scalef_ps(vf1, ve1);
91 
92     // Store 128 (8x16) results at a time.
93     _mm512_storeu_ps(y, vf0);
94     _mm512_storeu_ps(y + 0, vf0);
95     _mm512_storeu_ps(y + 16, vf1);
96     y += 32;
97   }
98 
99   for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
100     // Load 16 inputs at a time.
101     const __m512 vx = _mm512_loadu_ps(x);
102     x += 16;
103 
104     // Compute reduced argument elements := round(x / log(2)).
105     const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0);
106 
107     // Compute reduced argument t := x - elements * log(2).
108     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
109     __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx);
110     vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);
111 
112     // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
113     __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4);
114     vp = _mm512_fmadd_ps(vp, vt, vc3);
115     vp = _mm512_fmadd_ps(vp, vt, vc2);
116     vp = _mm512_fmadd_ps(vp, vt, vc1);
117     vp = _mm512_fmadd_ps(vp, vt, vc0);
118 
119     // Multiply "extended" floating-point numbers in ("mantissa", "exponent") representation.
120     __m512 vf = _mm512_mul_ps(vp, vscalev);
121     const __m512 ve = _mm512_add_ps(vn, vscalee);
122 
123     // Multiply "mantissa" by the exp2("exponent").
124     vf = _mm512_scalef_ps(vf, ve);
125 
126     // Store 16 results at a time.
127     _mm512_storeu_ps(y, vf);
128     y += 16;
129   }
130   if XNN_UNLIKELY(elements != 0) {
131     // Prepare mask for valid 32-bit elements (depends on elements).
132     elements >>= 2 /* log2(sizeof(float)) */;
133     const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << elements) - UINT32_C(1)));
134 
135     // Load up to 15 inputs at a time.
136     const __m512 vx = _mm512_maskz_loadu_ps(vmask, x);
137 
138     // Compute reduced argument elements := round(x / log(2)).
139     const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0);
140 
141     // Compute reduced argument t := x - elements * log(2).
142     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
143     __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx);
144     vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);
145 
146     // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
147     __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4);
148     vp = _mm512_fmadd_ps(vp, vt, vc3);
149     vp = _mm512_fmadd_ps(vp, vt, vc2);
150     vp = _mm512_fmadd_ps(vp, vt, vc1);
151     vp = _mm512_fmadd_ps(vp, vt, vc0);
152 
153     // Multiply "extended" floating-point numbers in ("mantissa", "exponent") representation.
154     __m512 vf = _mm512_mul_ps(vp, vscalev);
155     const __m512 ve = _mm512_add_ps(vn, vscalee);
156 
157     // Multiply "mantissa" by the exp2("exponent").
158     vf = _mm512_scalef_ps(vf, ve);
159 
160     // Store up to 15 results at a time.
161     _mm512_mask_storeu_ps(y, vmask, vf);
162   }
163   _mm256_zeroupper();
164 }
165