
Searched refs:vaccv (Results 1 – 25 of 26) sorted by relevance


/external/XNNPACK/src/f32-raddextexp/
avx512f-p5-scalef.c.in
41 __m512 vaccv${K} = _mm512_setzero_ps();
102 vaccv${K} = _mm512_scalef_ps(vaccv${K}, vdelta_acce${K});
104 vaccv${N % ACCUMULATORS} = _mm512_add_ps(vaccv${N % ACCUMULATORS}, _mm512_scalef_ps(vp${N}, vdelta…
127 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0);
129 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv${K}, vdelta_acce${K}));
132 __m512 vaccv = vaccv0;
159 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce);
160 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e));
191 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce);
192 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e));
[all …]
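
These matches are the accumulator-merge step of XNNPACK's extended-exponent reduction: each partial sum is carried as a mantissa vector vaccv${K} paired with a separate exponent, and _mm512_scalef_ps(x, e) computes x * 2^e, rebasing every partial onto a common exponent before the add. A minimal sketch of that merge (my own illustration, not the template's output; the vdelta_acce values stand in for exponent differences computed elsewhere in the kernel):

#include <immintrin.h>

/* Hedged sketch: merge `acc` extended-exponent partial sums. Each partial
 * is vaccv[k] * 2^e[k]; vdelta_acce[k] = e[k] - e_max, so scalef rebases
 * every mantissa onto the shared maximum exponent before adding.
 * Compile with -mavx512f. */
static inline __m512 merge_extexp(const __m512* vaccv, const __m512* vdelta_acce, int acc) {
  __m512 vsum = _mm512_scalef_ps(vaccv[0], vdelta_acce[0]);
  for (int k = 1; k < acc; k++) {
    vsum = _mm512_add_ps(vsum, _mm512_scalef_ps(vaccv[k], vdelta_acce[k]));
  }
  return vsum;  /* the generated kernels finish with _mm512_reduce_add_ps */
}
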
avx2-p5.c.in
45 __m256 vaccv${K} = _mm256_setzero_ps();
120 vaccv${K} = _mm256_mul_ps(vaccv${K}, vaccs${K});
122 vaccv${N % ACCUMULATORS} = _mm256_fmadd_ps(vp${N}, vs${N}, vaccv${N % ACCUMULATORS});
148 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0);
150 vaccv = _mm256_fmadd_ps(vaccv${K}, vaccs${K}, vaccv);
153 __m256 vaccv = vaccv0;
188 vaccv = _mm256_mul_ps(vaccv, vaccs);
189 vaccv = _mm256_fmadd_ps(vp, vs, vaccv);
232 vaccv = _mm256_mul_ps(vaccv, vaccs);
233 vaccv = _mm256_fmadd_ps(vp, vs, vaccv);
[all …]
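
The AVX2 template performs the same merge without the AVX-512 SCALEF instruction: the rebasing factor 2^delta is materialized as an ordinary float vector vaccs${K} (built by exponent-bit manipulation elsewhere in the kernel), so the combine collapses to one multiply plus fused multiply-adds. A sketch under that assumption (illustration only, not the template's output):

#include <immintrin.h>

/* Hedged sketch: combine `acc` partial sums vaccv[k], each carrying a
 * precomputed scale factor vaccs[k] = 2^delta, via FMA.
 * Compile with -mavx2 -mfma. */
static inline __m256 merge_acc_avx2(const __m256* vaccv, const __m256* vaccs, int acc) {
  __m256 vsum = _mm256_mul_ps(vaccv[0], vaccs[0]);
  for (int k = 1; k < acc; k++) {
    vsum = _mm256_fmadd_ps(vaccv[k], vaccs[k], vsum);  /* vsum += vaccv[k] * vaccs[k] */
  }
  return vsum;
}
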
/external/XNNPACK/src/f32-raddextexp/gen/
avx512f-p5-scalef-x192-acc6.c
260 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6() local
261 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
262 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
263 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv3, vdelta_acce3)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
264 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv4, vdelta_acce4)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
265 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv5, vdelta_acce5)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
292 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
293 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
324 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
325 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc6()
[all …]
avx512f-p5-scalef-x128-acc4.c
198 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() local
199 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
200 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
201 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv3, vdelta_acce3)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
228 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
229 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
260 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
261 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
269 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
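
The x128-acc4 kernel also shows the tail path: a __mmask16 selects the n < 16 valid remainder lanes, and the masked scalef/add variants update only those lanes while leaving the rest of vaccv untouched. A sketch of that step (the mask construction is my assumption of the usual idiom, not copied from the file):

#include <immintrin.h>
#include <stddef.h>

/* Hedged sketch: masked tail update for the last n (< 16) elements.
 * Masked-off lanes of vaccv pass through unchanged. */
static inline __m512 tail_update(__m512 vaccv, __m512 vdelta_acce,
                                 __m512 vp, __m512 vdelta_e, size_t n) {
  const __mmask16 vmask = (__mmask16) ((1u << n) - 1u);  /* hypothetical mask build */
  vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce);
  vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv,
                             _mm512_maskz_scalef_ps(vmask, vp, vdelta_e));
  return vaccv;
}
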
avx512f-p5-scalef-x160-acc5.c
230 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() local
231 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
232 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
233 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv3, vdelta_acce3)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
234 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv4, vdelta_acce4)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
261 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
262 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
293 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
294 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
302 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
avx2-p5-x64-acc4.c
229 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() local
230 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
231 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
232 vaccv = _mm256_fmadd_ps(vaccv3, vaccs3, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
267 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
268 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
311 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
312 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
324 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
325 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
avx512f-p5-scalef-x144-acc3.c
204 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() local
205 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
206 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
233 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
234 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
265 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
266 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
274 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
avx512f-p5-scalef-x128-acc2.c
184 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() local
185 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
212 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
213 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
244 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
245 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
253 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
avx2-p5-x96-acc6.c
299 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6() local
300 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
301 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
302 vaccv = _mm256_fmadd_ps(vaccv3, vaccs3, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
303 vaccv = _mm256_fmadd_ps(vaccv4, vaccs4, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
304 vaccv = _mm256_fmadd_ps(vaccv5, vaccs5, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
339 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
340 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
383 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
384 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
[all …]
avx2-p5-x80-acc5.c
265 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() local
266 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
267 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
268 vaccv = _mm256_fmadd_ps(vaccv3, vaccs3, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
269 vaccv = _mm256_fmadd_ps(vaccv4, vaccs4, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
304 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
305 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
348 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
349 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
361 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
[all …]
avx2-p5-x72-acc3.c
234 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() local
235 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
236 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
271 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
272 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
315 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
316 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
328 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
329 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
avx512f-p5-scalef-x160-acc2.c
208 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() local
209 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
236 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
237 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
268 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
269 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
277 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
avx512f-p5-scalef-x192-acc3.c
240 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() local
241 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
242 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv2, vdelta_acce2)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
269 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
270 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
301 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
302 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
310 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
avx512f-p5-scalef-x128.c
174 __m512 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() local
201 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
202 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
233 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
234 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
242 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
avx2-p5-x64-acc2.c
211 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() local
212 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
247 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
248 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
291 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
292 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
304 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
305 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
avx2-p5-x80-acc2.c
237 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() local
238 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
273 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
274 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
317 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
318 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
330 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
331 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
avx512f-p5-scalef-x144.c
186 __m512 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() local
213 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
214 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
245 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
246 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
254 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
avx512f-p5-scalef-x192-acc2.c
232 __m512 vaccv = _mm512_scalef_ps(vaccv0, vdelta_acce0); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() local
233 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vaccv1, vdelta_acce1)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
260 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
261 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
292 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
293 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
301 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
avx2-p5-x64.c
197 __m256 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx2_p5_x64() local
232 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
233 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
276 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
277 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
289 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
290 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
avx2-p5-x96-acc3.c
273 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() local
274 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
275 vaccv = _mm256_fmadd_ps(vaccv2, vaccs2, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
310 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
311 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
354 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
355 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
367 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
368 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
avx512f-p5-scalef-x160.c
198 __m512 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() local
225 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
226 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
257 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
258 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
266 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
avx2-p5-x72.c
210 __m256 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx2_p5_x72() local
245 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
246 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
289 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
290 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
302 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
303 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
avx2-p5-x80.c
223 __m256 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx2_p5_x80() local
258 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
259 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
302 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
303 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
315 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
316 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
avx512f-p5-scalef-x192.c
222 __m512 vaccv = vaccv0; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() local
249 vaccv = _mm512_scalef_ps(vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
250 vaccv = _mm512_add_ps(vaccv, _mm512_scalef_ps(vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
281 vaccv = _mm512_mask_scalef_ps(vaccv, vmask, vaccv, vdelta_acce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
282 vaccv = _mm512_mask_add_ps(vaccv, vmask, vaccv, _mm512_maskz_scalef_ps(vmask, vp, vdelta_e)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
290 sum[0] = _mm512_reduce_add_ps(_mm512_scalef_ps(vaccv, vdelta_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
avx2-p5-x96-acc2.c
263 __m256 vaccv = _mm256_mul_ps(vaccv0, vaccs0); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() local
264 vaccv = _mm256_fmadd_ps(vaccv1, vaccs1, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
299 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
300 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
343 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
344 vaccv = _mm256_fmadd_ps(vp, vs, vaccv); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
356 vaccv = _mm256_mul_ps(vaccv, vaccs); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
357 __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv), _mm256_extractf128_ps(vaccv, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
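
Every AVX2 kernel above ends with the same fold of the 256-bit accumulator down to 128 bits. The search snippets stop there; a sketch of the remaining horizontal reduction to a scalar (the last two shuffles are the standard idiom, my assumption rather than XNNPACK's exact code):

#include <immintrin.h>

/* Hedged sketch: horizontal sum of a __m256 accumulator, starting from
 * the 128-bit fold that appears in the snippets above. Uses the SSE3
 * _mm_movehdup_ps shuffle; compile with -mavx. */
static inline float hsum_avx(__m256 vaccv) {
  __m128 vaccv_sum = _mm_add_ps(_mm256_castps256_ps128(vaccv),
                                _mm256_extractf128_ps(vaccv, 1));         /* lanes 0..3 + lanes 4..7 */
  vaccv_sum = _mm_add_ps(vaccv_sum, _mm_movehl_ps(vaccv_sum, vaccv_sum)); /* add upper pair onto lower */
  vaccv_sum = _mm_add_ss(vaccv_sum, _mm_movehdup_ps(vaccv_sum));          /* add element 1 into element 0 */
  return _mm_cvtss_f32(vaccv_sum);
}
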
