/external/XNNPACK/src/f32-raddextexp/ |
D | avx2-p5.c.in | 132 const __m256 vmax_acce${ABC[A:A+2]} = _mm256_max_ps(vacce${A}, vacce${A+1}); 134 const __m256 vmax_acce${ABC[A]} = vacce${A}; 139 …const __m256 vmax_acce${ABC[A:min(A+ACC_SLICE*2, ACCUMULATORS)]} = _mm256_max_ps(vmax_acce${ABC[A:… 143 …const __m256 vdelta_acce${K} = _mm256_max_ps(_mm256_sub_ps(vacce${K}, vmax_acce${ABC[0:ACCUMULATOR… 151 __m256 vacce = vmax_acce${ABC[0:ACCUMULATORS]}; 239 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); 240 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … 241 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … 242 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); 251 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce));
|
D | avx512f-p5-scalef.c.in | 114 const __m512 vmax_acce${ABC[A:A+2]} = _mm512_max_ps(vacce${A}, vacce${A+1}); 116 const __m512 vmax_acce${ABC[A]} = vacce${A}; 121 …const __m512 vmax_acce${ABC[A:min(A+ACC_SLICE*2, ACCUMULATORS)]} = _mm512_max_ps(vmax_acce${ABC[A:… 125 const __m512 vdelta_acce${K} = _mm512_sub_ps(vacce${K}, vmax_acce${ABC[0:ACCUMULATORS]}); 130 __m512 vacce = vmax_acce${ABC[0:ACCUMULATORS]}; 197 const float vmax_acce = _mm512_reduce_max_ps(vacce); 198 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); 201 sum[1] = vmax_acce;
|
/external/XNNPACK/src/f32-raddextexp/gen/ |
D | avx2-p5-x64.c | 283 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() local 284 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 285 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 286 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64() 295 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64()
|
D | avx2-p5-x72.c | 296 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() local 297 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 298 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 299 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72() 308 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72()
|
D | avx2-p5-x64-acc2.c | 298 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() local 299 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 300 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 301 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2() 310 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc2()
|
D | avx2-p5-x64-acc4.c | 318 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() local 319 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 320 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 321 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4() 330 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x64_acc4()
|
D | avx2-p5-x80-acc2.c | 324 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() local 325 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 326 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 327 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2() 336 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2()
|
D | avx2-p5-x80.c | 309 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() local 310 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 311 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 312 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80() 321 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80()
|
D | avx2-p5-x72-acc3.c | 322 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() local 323 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 324 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 325 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3() 334 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x72_acc3()
|
D | avx2-p5-x96.c | 335 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() local 336 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 337 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 338 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96() 347 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96()
|
D | avx2-p5-x96-acc3.c | 361 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() local 362 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 363 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 364 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3() 373 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc3()
|
D | avx2-p5-x96-acc2.c | 350 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() local 351 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 352 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 353 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2() 362 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc2()
|
D | avx2-p5-x80-acc5.c | 355 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() local 356 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 357 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 358 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5() 367 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc5()
|
D | avx2-p5-x96-acc6.c | 390 __m256 vmax_acce = _mm256_max_ps(vacce, _mm256_permute2f128_ps(vacce, vacce, 1)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6() local 391 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(1, 0, 3, … in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6() 392 …vmax_acce = _mm256_max_ps(vmax_acce, _mm256_shuffle_ps(vmax_acce, vmax_acce, _MM_SHUFFLE(2, 3, 0, … in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6() 393 const __m256 vdelta_acce = _mm256_max_ps(_mm256_sub_ps(vacce, vmax_acce), vmin_exponent); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6() 402 _mm_store_ss(&sum[1], _mm256_castps256_ps128(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx2_p5_x96_acc6()
|
D | avx512f-p5-scalef-x128.c | 239 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() local 240 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128() 243 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128()
|
D | avx512f-p5-scalef-x128-acc2.c | 250 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() local 251 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2() 254 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2()
|
D | avx512f-p5-scalef-x144.c | 251 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() local 252 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144() 255 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144()
|
D | avx512f-p5-scalef-x128-acc4.c | 266 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() local 267 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4() 270 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc4()
|
D | avx512f-p5-scalef-x160-acc2.c | 274 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() local 275 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2() 278 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc2()
|
D | avx512f-p5-scalef-x160.c | 263 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() local 264 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160() 267 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160()
|
D | avx512f-p5-scalef-x144-acc3.c | 271 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() local 272 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3() 275 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3()
|
D | avx512f-p5-scalef-x192-acc2.c | 298 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() local 299 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2() 302 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc2()
|
D | avx512f-p5-scalef-x160-acc5.c | 299 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() local 300 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5() 303 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x160_acc5()
|
D | avx512f-p5-scalef-x192.c | 287 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() local 288 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192() 291 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192()
|
D | avx512f-p5-scalef-x192-acc3.c | 307 const float vmax_acce = _mm512_reduce_max_ps(vacce); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() local 308 const __m512 vdelta_acce = _mm512_sub_ps(vacce, _mm512_set1_ps(vmax_acce)); in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3() 311 sum[1] = vmax_acce; in xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x192_acc3()
|