/external/XNNPACK/src/f32-raddextexp/ |
D | avx512f-p5-scalef.c.in | 98 const __m512 vdelta_e${N} = _mm512_sub_ps(vn${N}, vmax_e${N % ACCUMULATORS}); 104 … % ACCUMULATORS} = _mm512_add_ps(vaccv${N % ACCUMULATORS}, _mm512_scalef_ps(vp${N}, vdelta_e${N})); 158 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); 160 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); 190 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); 192 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e));
|
D | avx2-p5.c.in | 106 …const __m256 vdelta_e${N} = _mm256_max_ps(_mm256_sub_ps(vn${N}, vmax_e${N % ACCUMULATORS}), vmin_e… 116 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e${N}, vmagic_bias)… 181 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); 185 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… 224 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); 228 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2…
|
/external/XNNPACK/src/f32-raddextexp/gen/ |
D | avx512f-p5-scalef-x128.c | 200 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() local 202 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() 232 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() local 234 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
|
D | avx512f-p5-scalef-x128-acc2.c | 211 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() local 213 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() 243 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() local 245 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
|
D | avx512f-p5-scalef-x144.c | 212 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() local 214 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() 244 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() local 246 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
|
D | avx512f-p5-scalef-x128-acc4.c | 227 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() local 229 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() 259 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() local 261 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
|
D | avx512f-p5-scalef-x160-acc2.c | 235 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() local 237 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() 267 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() local 269 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
|
D | avx512f-p5-scalef-x160.c | 224 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() local 226 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() 256 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() local 258 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
|
D | avx512f-p5-scalef-x144-acc3.c | 232 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() local 234 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() 264 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() local 266 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
|
D | avx2-p5-x64.c | 225 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() local 229 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 268 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() local 272 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
|
D | avx512f-p5-scalef-x192-acc2.c | 259 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() local 261 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() 291 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() local 293 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
|
D | avx2-p5-x72.c | 238 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() local 242 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 281 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() local 285 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
|
D | avx512f-p5-scalef-x160-acc5.c | 260 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() local 262 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() 292 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() local 294 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
|
D | avx2-p5-x64-acc2.c | 240 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() local 244 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 283 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() local 287 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
|
D | avx512f-p5-scalef-x192.c | 248 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() local 250 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() 280 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() local 282 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
|
D | avx2-p5-x64-acc4.c | 260 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() local 264 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 303 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() local 307 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
|
D | avx512f-p5-scalef-x192-acc3.c | 268 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() local 270 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() 300 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() local 302 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
|
D | avx2-p5-x80-acc2.c | 266 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() local 270 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 309 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() local 313 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
|
D | avx512f-p5-scalef-x192-acc6.c | 291 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() local 293 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() 323 const __m512 vdelta_e = _mm512_sub_ps(vn, vmax_e); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() local 325 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
|
D | avx2-p5-x80.c | 251 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() local 255 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 294 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() local 298 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
|
D | avx2-p5-x72-acc3.c | 264 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() local 268 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 307 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() local 311 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
|
D | avx2-p5-x96.c | 277 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() local 281 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 320 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() local 324 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x96()
|
D | avx2-p5-x96-acc3.c | 303 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() local 307 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 346 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() local 350 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
|
D | avx2-p5-x96-acc2.c | 292 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() local 296 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 335 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() local 339 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
|
D | avx2-p5-x80-acc5.c | 297 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() local 301 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 340 const __m256 vdelta_e = _mm256_max_ps(_mm256_sub_ps(vn, vmax_e), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() local 344 …_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(_mm256_add_ps(vdelta_e, vmagic_bias)), 2… in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
|