/external/XNNPACK/src/f32-raddextexp/ |
D | avx512f-p5-scalef.c.in | 43 __m512 vacce${K} = vminus_inf; 91 __m512 vmax_e${N} = _mm512_max_ps(vacce${N}, vn${N}); 96 const __m512 vdelta_acce${K} = _mm512_sub_ps(vacce${K}, vmax_e${K}); 107 vacce${K} = vmax_e${K}; 114 const __m512 vmax_acce${ABC[A:A+2]} = _mm512_max_ps(vacce${A}, vacce${A+1}); 116 const __m512 vmax_acce${ABC[A]} = vacce${A}; 125 const __m512 vdelta_acce${K} = _mm512_sub_ps(vacce${K}, vmax_acce${ABC[0:ACCUMULATORS]}); 130 __m512 vacce = vmax_acce${ABC[0:ACCUMULATORS]}; 133 __m512 vacce = vacce0; 156 const __m512 vmax_e = _mm512_max_ps(vacce, vn); [all …]
|
D | avx2-p5.c.in | 47 __m256 vacce${K} = vminus_inf; 95 __m256 vmax_e${N} = _mm256_max_ps(vacce${N}, vn${N}); 104 … const __m256 vdelta_acce${K} = _mm256_max_ps(_mm256_sub_ps(vacce${K}, vmax_e${K}), vmin_exponent); 125 vacce${K} = vmax_e${K}; 132 const __m256 vmax_acce${ABC[A:A+2]} = _mm256_max_ps(vacce${A}, vacce${A+1}); 134 const __m256 vmax_acce${ABC[A]} = vacce${A}; 143 …const __m256 vdelta_acce${K} = _mm256_max_ps(_mm256_sub_ps(vacce${K}, vmax_acce${ABC[0:ACCUMULATOR… 151 __m256 vacce = vmax_acce${ABC[0:ACCUMULATORS]}; 154 __m256 vacce = vacce0; 177 const __m256 vmax_e = _mm256_max_ps(vacce, vn); [all …]
|
/external/XNNPACK/src/f32-raddextexp/gen/ |
D | avx2-p5-x64.c | 198 __m256 vacce = vacce0; in xnn_f32_raddextexp_ukernel__avx2_p5_x64() local 221 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 224 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 235 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 254 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 265 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 269 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 279 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 283 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 286 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
|
D | avx512f-p5-scalef-x128.c | 175 __m512 vacce = vacce0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() local 198 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() 199 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() 204 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() 230 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() 231 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() 235 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() 239 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() 240 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
|
D | avx512f-p5-scalef-x128-acc2.c | 186 __m512 vacce = vmax_acce01; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() local 209 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() 210 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() 215 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() 241 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() 242 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() 246 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() 250 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() 251 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
|
D | avx2-p5-x72.c | 211 __m256 vacce = vacce0; in xnn_f32_raddextexp_ukernel__avx2_p5_x72() local 234 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 237 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 248 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 267 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 278 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 282 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 292 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 296 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 299 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
|
D | avx2-p5-x64-acc2.c | 213 __m256 vacce = vmax_acce01; in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() local 236 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 239 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 250 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 269 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 280 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 284 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 294 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 298 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 301 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
|
D | avx512f-p5-scalef-x144.c | 187 __m512 vacce = vacce0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() local 210 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() 211 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() 216 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() 242 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() 243 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() 247 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() 251 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() 252 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
|
D | avx2-p5-x64-acc4.c | 233 __m256 vacce = vmax_acce0123; in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() local 256 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 259 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 270 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 289 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 300 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 304 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 314 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 318 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 321 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
|
D | avx2-p5-x80-acc2.c | 239 __m256 vacce = vmax_acce01; in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() local 262 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 265 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 276 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 295 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 306 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 310 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 320 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 324 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 327 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
|
D | avx2-p5-x80.c | 224 __m256 vacce = vacce0; in xnn_f32_raddextexp_ukernel__avx2_p5_x80() local 247 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 250 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 261 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 280 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 291 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 295 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 305 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 309 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 312 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
|
D | avx2-p5-x72-acc3.c | 237 __m256 vacce = vmax_acce012; in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() local 260 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 263 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 274 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 293 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 304 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 308 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 318 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 322 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 325 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
|
D | avx512f-p5-scalef-x128-acc4.c | 202 __m512 vacce = vmax_acce0123; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() local 225 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() 226 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() 231 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() 257 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() 258 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() 262 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() 266 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() 267 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
|
D | avx512f-p5-scalef-x160-acc2.c | 210 __m512 vacce = vmax_acce01; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() local 233 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() 234 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() 239 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() 265 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() 266 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() 270 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() 274 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() 275 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
|
D | avx512f-p5-scalef-x160.c | 199 __m512 vacce = vacce0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() local 222 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() 223 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() 228 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() 254 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() 255 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() 259 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() 263 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() 264 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
|
D | avx512f-p5-scalef-x144-acc3.c | 207 __m512 vacce = vmax_acce012; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() local 230 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() 231 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() 236 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() 262 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() 263 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() 267 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() 271 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() 272 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
|
D | avx2-p5-x96.c | 250 __m256 vacce = vacce0; in xnn_f32_raddextexp_ukernel__avx2_p5_x96() local 273 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 276 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 287 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 306 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 317 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 321 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 331 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 335 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 338 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96()
|
D | avx512f-p5-scalef-x192-acc2.c | 234 __m512 vacce = vmax_acce01; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() local 257 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() 258 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() 263 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() 289 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() 290 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() 294 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() 298 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() 299 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
|
D | avx512f-p5-scalef-x160-acc5.c | 235 __m512 vacce = vmax_acce01234; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() local 258 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() 259 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() 264 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() 290 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() 291 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() 295 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() 299 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() 300 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
|
D | avx512f-p5-scalef-x192.c | 223 __m512 vacce = vacce0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() local 246 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() 247 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() 252 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() 278 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() 279 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() 283 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() 287 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() 288 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
|
D | avx2-p5-x96-acc3.c | 276 __m256 vacce = vmax_acce012; in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() local 299 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 302 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 313 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 332 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 343 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 347 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 357 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 361 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 364 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
|
D | avx2-p5-x96-acc2.c | 265 __m256 vacce = vmax_acce01; in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() local 288 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 291 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 302 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 321 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 332 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 336 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 346 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 350 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 353 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
|
D | avx2-p5-x80-acc5.c | 270 __m256 vacce = vmax_acce01234; in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() local 293 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 296 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 307 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 326 vn = _mm256_blendv_ps(vacce, vn, _mm256_castsi256_ps(vmask)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 337 const __m256 vmax_e = _mm256_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 341 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 351 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 355 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 358 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
|
D | avx512f-p5-scalef-x192-acc3.c | 243 __m512 vacce = vmax_acce012; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() local 266 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() 267 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() 272 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() 298 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() 299 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() 303 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() 307 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() 308 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
|
D | avx512f-p5-scalef-x192-acc6.c | 266 __m512 vacce = vmax_acce012345; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() local 289 const __m512 vmax_e = _mm512_max_ps(vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() 290 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() 295 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() 321 const __m512 vmax_e = _mm512_mask_max_ps(vacce, vmask, vacce, vn); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() 322 const __m512 vdelta_acce = _mm512_sub_ps(vacce, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() 326 vacce = vmax_e; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() 330 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() 331 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
|