/*===---- avx512erintrin.h - AVX-512ER intrinsics ------------------------===
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining a copy
4  * of this software and associated documentation files (the "Software"), to deal
5  * in the Software without restriction, including without limitation the rights
6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7  * copies of the Software, and to permit persons to whom the Software is
8  * furnished to do so, subject to the following conditions:
9  *
10  * The above copyright notice and this permission notice shall be included in
11  * all copies or substantial portions of the Software.
12  *
13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19  * THE SOFTWARE.
20  *
21  *===-----------------------------------------------------------------------===
22  */
23 #ifndef __IMMINTRIN_H
24 #error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
25 #endif
26 
27 #ifndef __AVX512ERINTRIN_H
28 #define __AVX512ERINTRIN_H
29 
30 
// exp2a23: approximate 2^x with relative error bounded by 2^-23
// (AVX-512ER VEXP2PD).  The *_round_* forms take an explicit rounding /
// SAE operand R; the plain forms use the current rounding direction.

// Unmasked: compute 2^A[i] for all 8 double lanes.
#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)_mm512_setzero_pd(), \
                                      (__mmask8)-1, (R)); })

// Merge-masked: lanes with a 0 bit in M keep the value from S.
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)(__m512d)(S), \
                                      (__mmask8)(M), (R)); })

// Zero-masked: lanes with a 0 bit in M are zeroed.
#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)_mm512_setzero_pd(), \
                                      (__mmask8)(M), (R)); })

// Convenience wrappers using the current rounding direction.
#define _mm512_exp2a23_pd(A) \
   _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_pd(S, M, A) \
   _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_pd(M, A) \
   _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
55 
// exp2a23 for 16 packed floats (VEXP2PS).  A 512-bit ps operation has 16
// lanes and therefore needs a 16-bit mask: the original casts to __mmask8,
// which silently truncates the mask and disables the upper 8 lanes.  All
// three variants below use __mmask16, matching the builtin's mask operand.

// Unmasked: compute 2^A[i] for all 16 float lanes.
#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)_mm512_setzero_ps(), \
                                     (__mmask16)-1, (R)); })

// Merge-masked: lanes with a 0 bit in M keep the value from S.
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)(__m512)(S), \
                                     (__mmask16)(M), (R)); })

// Zero-masked: lanes with a 0 bit in M are zeroed.
#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)_mm512_setzero_ps(), \
                                     (__mmask16)(M), (R)); })
70 
// exp2a23 ps convenience wrappers using the current rounding direction.
#define _mm512_exp2a23_ps(A) \
   _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_ps(S, M, A) \
   _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_ps(M, A) \
   _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
79 
// rsqrt28: approximate 1/sqrt(x) with relative error bounded by 2^-28
// (AVX-512ER VRSQRT28PD), 8 double lanes.

// Unmasked variant; all 8 lanes are computed.
#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)_mm512_setzero_pd(), \
                                         (__mmask8)-1, (R)); })

// Merge-masked: lanes with a 0 bit in M keep the value from S.
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)(__m512d)(S), \
                                         (__mmask8)(M), (R)); })

// Zero-masked: lanes with a 0 bit in M are zeroed.
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)_mm512_setzero_pd(), \
                                         (__mmask8)(M), (R)); })

// Convenience wrappers using the current rounding direction.
#define _mm512_rsqrt28_pd(A) \
  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_pd(S, M, A) \
  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_pd(M, A) \
  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
104 
// rsqrt28 for 16 packed floats (VRSQRT28PS): 1/sqrt(x), relative error
// bounded by 2^-28.
//
// Fix: in _mm512_mask_rsqrt28_ps the argument A was forwarded without
// parentheses, so an argument containing a low-precedence operator or
// a comma would expand incorrectly.  Every macro argument must be
// parenthesized at each use site.

// Unmasked variant; all 16 lanes are computed.
#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
                                        (__mmask16)-1, (R)); })

// Merge-masked: lanes with a 0 bit in M keep the value from S.
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)(__m512)(S), \
                                        (__mmask16)(M), (R)); })

// Zero-masked: lanes with a 0 bit in M are zeroed.
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
                                        (__mmask16)(M), (R)); })

// Convenience wrappers using the current rounding direction.
#define _mm512_rsqrt28_ps(A) \
  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_ps(S, M, A) \
  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_ps(M, A) \
  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
128 
// Scalar rsqrt28 (VRSQRT28SS): the low float lane of the result is
// 1/sqrt(B[0]) to 2^-28 relative accuracy; the upper lanes are taken
// from A (per the AVX-512ER scalar-op convention).

// Unmasked scalar variant.
#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)_mm_setzero_ps(), \
                                        (__mmask8)-1, (R)); })

// Merge-masked: if bit 0 of M is clear, the low lane comes from S.
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)(__m128)(S), \
                                        (__mmask8)(M), (R)); })

// Zero-masked: if bit 0 of M is clear, the low lane is zeroed.
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)_mm_setzero_ps(), \
                                        (__mmask8)(M), (R)); })

// Convenience wrappers using the current rounding direction.
#define _mm_rsqrt28_ss(A, B) \
  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rsqrt28_ss(S, M, A, B) \
  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rsqrt28_ss(M, A, B) \
  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
155 
// Scalar rsqrt28 (VRSQRT28SD): the low double lane of the result is
// 1/sqrt(B[0]) to 2^-28 relative accuracy; the upper lane is taken from A.
//
// Fix: _mm_maskz_rsqrt28_sd previously expanded to the merge-masking
// macro _mm_mask_rsqrt28_round_sd, so M was consumed as the source
// operand S and the zero-masking semantics were lost entirely.  It now
// forwards to _mm_maskz_rsqrt28_round_sd.

// Unmasked scalar variant.
#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)_mm_setzero_pd(), \
                                         (__mmask8)-1, (R)); })

// Merge-masked: if bit 0 of M is clear, the low lane comes from S.
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)(__m128d)(S), \
                                         (__mmask8)(M), (R)); })

// Zero-masked: if bit 0 of M is clear, the low lane is zeroed.
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)_mm_setzero_pd(), \
                                         (__mmask8)(M), (R)); })

// Convenience wrappers using the current rounding direction.
#define _mm_rsqrt28_sd(A, B) \
  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rsqrt28_sd(S, M, A, B) \
  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rsqrt28_sd(M, A, B) \
  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
182 
// rcp28: approximate 1/x with relative error bounded by 2^-28
// (AVX-512ER VRCP28PD), 8 double lanes.

// Unmasked variant; all 8 lanes are computed.
#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
                                       (__mmask8)-1, (R)); })

// Merge-masked: lanes with a 0 bit in M keep the value from S.
#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)(__m512d)(S), \
                                       (__mmask8)(M), (R)); })

// Zero-masked: lanes with a 0 bit in M are zeroed.
#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
                                       (__mmask8)(M), (R)); })

// Convenience wrappers using the current rounding direction.
#define _mm512_rcp28_pd(A) \
  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_pd(S, M, A) \
  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_pd(M, A) \
  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
207 
// rcp28 for 16 packed floats (VRCP28PS): 1/x, relative error bounded
// by 2^-28.  Note the 16-lane operation correctly uses __mmask16.

// Unmasked variant; all 16 lanes are computed.
#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
                                      (__mmask16)-1, (R)); })

// Merge-masked: lanes with a 0 bit in M keep the value from S.
#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)(__m512)(S), \
                                      (__mmask16)(M), (R)); })

// Zero-masked: lanes with a 0 bit in M are zeroed.
#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
                                      (__mmask16)(M), (R)); })

// Convenience wrappers using the current rounding direction.
#define _mm512_rcp28_ps(A) \
  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_ps(S, M, A) \
  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_ps(M, A) \
  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
231 
// Scalar rcp28 (VRCP28SS): the low float lane of the result is 1/B[0]
// to 2^-28 relative accuracy; the upper lanes are taken from A.

// Unmasked scalar variant.
#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
                                      (__v4sf)(__m128)(B), \
                                      (__v4sf)_mm_setzero_ps(), \
                                      (__mmask8)-1, (R)); })

// Merge-masked: if bit 0 of M is clear, the low lane comes from S.
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
                                      (__v4sf)(__m128)(B), \
                                      (__v4sf)(__m128)(S), \
                                      (__mmask8)(M), (R)); })

// Zero-masked: if bit 0 of M is clear, the low lane is zeroed.
#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
                                      (__v4sf)(__m128)(B), \
                                      (__v4sf)_mm_setzero_ps(), \
                                      (__mmask8)(M), (R)); })

// Convenience wrappers using the current rounding direction.
#define _mm_rcp28_ss(A, B) \
  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rcp28_ss(S, M, A, B) \
  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rcp28_ss(M, A, B) \
  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
258 
// Scalar rcp28 (VRCP28SD): the low double lane of the result is 1/B[0]
// to 2^-28 relative accuracy; the upper lane is taken from A.

// Unmasked scalar variant.
#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
                                       (__v2df)(__m128d)(B), \
                                       (__v2df)_mm_setzero_pd(), \
                                       (__mmask8)-1, (R)); })

// Merge-masked: if bit 0 of M is clear, the low lane comes from S.
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
                                       (__v2df)(__m128d)(B), \
                                       (__v2df)(__m128d)(S), \
                                       (__mmask8)(M), (R)); })

// Zero-masked: if bit 0 of M is clear, the low lane is zeroed.
#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
                                       (__v2df)(__m128d)(B), \
                                       (__v2df)_mm_setzero_pd(), \
                                       (__mmask8)(M), (R)); })

// Convenience wrappers using the current rounding direction.
#define _mm_rcp28_sd(A, B) \
  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rcp28_sd(S, M, A, B) \
  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rcp28_sd(M, A, B) \
  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
285 
286 #endif // __AVX512ERINTRIN_H
287