/*===---- avx512erintrin.h - AVX512ER intrinsics --------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512ERINTRIN_H
#define __AVX512ERINTRIN_H

// exp2a23: approximation of 2^x with a maximum relative error of 2^-23
#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)_mm512_setzero_pd(), \
                                      (__mmask8)-1, (R)); })

#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)(__m512d)(S), \
                                      (__mmask8)(M), (R)); })

#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)_mm512_setzero_pd(), \
                                      (__mmask8)(M), (R)); })

#define _mm512_exp2a23_pd(A) \
  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_pd(S, M, A) \
  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_pd(M, A) \
  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)_mm512_setzero_ps(), \
                                     (__mmask16)-1, (R)); })

#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)(__m512)(S), \
                                     (__mmask16)(M), (R)); })

#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)_mm512_setzero_ps(), \
                                     (__mmask16)(M), (R)); })

#define _mm512_exp2a23_ps(A) \
  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_ps(S, M, A) \
  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_ps(M, A) \
  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
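/* Illustrative usage sketch (not part of the original header): the exp2a23
 * intrinsics approximate 2^x per lane; with the maskz_ forms, lanes whose
 * mask bit is clear are zeroed, and with the mask_ forms they are taken from
 * the source operand.
 *
 *   __m512 x = _mm512_set1_ps(3.0f);
 *   __m512 a = _mm512_exp2a23_ps(x);               // every lane ~= 8.0f
 *   __m512 b = _mm512_maskz_exp2a23_ps(0x00FF, x); // lanes 0-7 ~= 8.0f,
 *                                                  // lanes 8-15 = 0.0f
 */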
// rsqrt28: approximation of 1/sqrt(x) with a maximum relative error of 2^-28
#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)_mm512_setzero_pd(), \
                                         (__mmask8)-1, (R)); })

#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)(__m512d)(S), \
                                         (__mmask8)(M), (R)); })

#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)_mm512_setzero_pd(), \
                                         (__mmask8)(M), (R)); })

#define _mm512_rsqrt28_pd(A) \
  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_pd(S, M, A) \
  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_pd(M, A) \
  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
                                        (__mmask16)-1, (R)); })

#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)(__m512)(S), \
                                        (__mmask16)(M), (R)); })

#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
                                        (__mmask16)(M), (R)); })

#define _mm512_rsqrt28_ps(A) \
  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_ps(S, M, A) \
  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_ps(M, A) \
  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)_mm_setzero_ps(), \
                                        (__mmask8)-1, (R)); })

#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)(__m128)(S), \
                                        (__mmask8)(M), (R)); })

#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)_mm_setzero_ps(), \
                                        (__mmask8)(M), (R)); })

#define _mm_rsqrt28_ss(A, B) \
  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rsqrt28_ss(S, M, A, B) \
  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rsqrt28_ss(M, A, B) \
  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)_mm_setzero_pd(), \
                                         (__mmask8)-1, (R)); })

#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)(__m128d)(S), \
                                         (__mmask8)(M), (R)); })

#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)_mm_setzero_pd(), \
                                         (__mmask8)(M), (R)); })

#define _mm_rsqrt28_sd(A, B) \
  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rsqrt28_sd(S, M, A, B) \
  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rsqrt28_sd(M, A, B) \
  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
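/* Illustrative usage sketch (not part of the original header): rsqrt28
 * returns 1/sqrt(x) to within 2^-28 relative error, precise enough that a
 * Newton-Raphson refinement step is usually unnecessary.
 *
 *   __m512d v = _mm512_set1_pd(4.0);
 *   __m512d r = _mm512_rsqrt28_pd(v);  // every lane ~= 0.5
 */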
// rcp28: approximation of 1/x with a maximum relative error of 2^-28
#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
                                       (__mmask8)-1, (R)); })

#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)(__m512d)(S), \
                                       (__mmask8)(M), (R)); })

#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
                                       (__mmask8)(M), (R)); })

#define _mm512_rcp28_pd(A) \
  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_pd(S, M, A) \
  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_pd(M, A) \
  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
                                      (__mmask16)-1, (R)); })

#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)(__m512)(S), \
                                      (__mmask16)(M), (R)); })

#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
                                      (__mmask16)(M), (R)); })

#define _mm512_rcp28_ps(A) \
  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_ps(S, M, A) \
  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_ps(M, A) \
  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
                                      (__v4sf)(__m128)(B), \
                                      (__v4sf)_mm_setzero_ps(), \
                                      (__mmask8)-1, (R)); })

#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
                                      (__v4sf)(__m128)(B), \
                                      (__v4sf)(__m128)(S), \
                                      (__mmask8)(M), (R)); })

#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
                                      (__v4sf)(__m128)(B), \
                                      (__v4sf)_mm_setzero_ps(), \
                                      (__mmask8)(M), (R)); })

#define _mm_rcp28_ss(A, B) \
  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rcp28_ss(S, M, A, B) \
  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rcp28_ss(M, A, B) \
  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
                                       (__v2df)(__m128d)(B), \
                                       (__v2df)_mm_setzero_pd(), \
                                       (__mmask8)-1, (R)); })

#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
                                       (__v2df)(__m128d)(B), \
                                       (__v2df)(__m128d)(S), \
                                       (__mmask8)(M), (R)); })

#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
                                       (__v2df)(__m128d)(B), \
                                       (__v2df)_mm_setzero_pd(), \
                                       (__mmask8)(M), (R)); })

#define _mm_rcp28_sd(A, B) \
  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rcp28_sd(S, M, A, B) \
  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rcp28_sd(M, A, B) \
  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
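/* Illustrative usage sketch (not part of the original header): rcp28
 * approximates 1/x to within 2^-28 relative error. The scalar forms compute
 * the reciprocal of element 0 of B and copy the upper element from A.
 *
 *   __m128d a = _mm_set_pd(9.0, 1.0);  // element 1 = 9.0, element 0 = 1.0
 *   __m128d b = _mm_set_pd(0.0, 8.0);  // element 0 = 8.0
 *   __m128d r = _mm_rcp28_sd(a, b);    // element 0 ~= 0.125, element 1 = 9.0
 */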
#endif // __AVX512ERINTRIN_H