/external/XNNPACK/src/f32-vsqrt/gen/ |
D | avx512f-nr1fma1adj-x128.c | 37 const __m512 vx7 = _mm512_loadu_ps(x + 112); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() local 47 const __m512 vrsqrtx7 = _mm512_rsqrt14_ps(vx7); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 63 __m512 vsqrtx7 = _mm512_mul_ps(vrsqrtx7, vx7); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 99 const __m512 vadjustment7 = _mm512_fnmadd_ps(vsqrtx7, vsqrtx7, vx7); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
|
D | fma3-nr1fma1adj-x64.c | 38 const __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() local 48 const __m256 vrsqrtx7 = _mm256_rsqrt_ps(vx7); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 64 __m256 vsqrtx7 = _mm256_mul_ps(vrsqrtx7, vx7); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 100 const __m256 vadjustment7 = _mm256_fnmadd_ps(vsqrtx7, vsqrtx7, vx7); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
|
/external/XNNPACK/src/f32-velu/gen/ |
D | velu-avx2-rr1-lut16-p3-gather-x64.c | 50 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64() local 60 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64() 180 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64() 189 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64()
|
D | velu-avx2-rr1-p6-x64.c | 50 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_p6_x64() local 60 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_p6_x64() 182 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_p6_x64() 191 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_p6_x64()
|
D | velu-avx2-rr1-lut8-p4-perm-x64.c | 50 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64() local 60 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64() 180 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64() 189 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64()
|
D | velu-avx2-rr1-lut4-p4-perm-x64.c | 51 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64() local 61 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64() 181 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64() 190 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64()
|
D | velu-avx2-rr1-lut8-p4-perm-x72.c | 50 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x72() local 61 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x72() 194 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x72() 205 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x72()
|
D | velu-avx2-rr1-lut4-p4-perm-x72.c | 51 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72() local 62 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72() 195 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72() 206 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72()
|
D | velu-avx2-rr1-p6-x72.c | 50 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_p6_x72() local 61 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_p6_x72() 196 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_p6_x72() 207 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_p6_x72()
|
D | velu-avx2-rr1-lut16-p3-gather-x72.c | 50 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72() local 61 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72() 194 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72() 205 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72()
|
D | velu-avx2-rr1-lut4-p4-perm-x80.c | 51 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x80() local 63 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x80() 209 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x80() 222 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x80()
|
D | velu-avx2-rr1-lut16-p3-gather-x80.c | 50 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x80() local 62 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x80() 208 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x80() 221 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x80()
|
D | velu-avx2-rr1-p6-x80.c | 50 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_p6_x80() local 62 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_p6_x80() 210 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_p6_x80() 223 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_p6_x80()
|
D | velu-avx2-rr1-lut8-p4-perm-x80.c | 50 __m256 vx7 = _mm256_loadu_ps(x + 56); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x80() local 62 const __m256 vz7 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x80() 208 vx7 = _mm256_mul_ps(vx7, vbeta); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x80() 221 const __m256 vy7 = _mm256_blendv_ps(vx7, ve7, vx7); in xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x80()
|
D | velu-avx512f-rr1-lut16-p3-perm-x128.c | 50 __m512 vx7 = _mm512_loadu_ps(x + 112); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x128() local 60 const __m512 vz7 = _mm512_max_ps(vsat_cutoff, _mm512_mul_ps(vx7, vprescale)); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x128() 174 const __mmask16 vsign7 = _mm512_cmp_ps_mask(vx7, vzero, _CMP_NLT_US); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x128() 183 vy7 = _mm512_mask_mul_ps(vy7, vsign7, vx7, vbeta); in xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x128()
|
/external/XNNPACK/src/f32-raddstoreexpminusmax/gen/ |
D | avx2-p5-x64-acc2.c | 65 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2() local 75 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2() 107 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2() 186 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2()
|
D | avx2-p5-x64.c | 64 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64() local 74 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64() 106 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64() 185 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64()
|
D | avx2-p5-x64-acc4.c | 67 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc4() local 77 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc4() 109 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc4() 188 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc4()
|
/external/XNNPACK/src/f32-vscaleexpminusmax/gen/ |
D | avx2-p5-x64.c | 65 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x64() local 75 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x64() 107 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x64() 186 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x64()
|
D | avx2-p5-x72.c | 66 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x72() local 77 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x72() 112 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x72() 199 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x72()
|
/external/XNNPACK/src/f32-raddexpminusmax/gen/ |
D | avx2-p5-x64-acc4.c | 66 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc4() local 76 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc4() 108 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc4() 187 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc4()
|
D | avx2-p5-x64.c | 63 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64() local 73 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64() 105 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64() 184 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64()
|
D | avx2-p5-x64-acc2.c | 64 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc2() local 74 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc2() 106 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc2() 185 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc2()
|
D | avx2-p5-x72-acc3.c | 66 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72_acc3() local 77 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72_acc3() 112 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72_acc3() 199 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72_acc3()
|
D | avx2-p5-x72.c | 64 const __m256 vx7 = _mm256_sub_ps(vi7, vi_max); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72() local 75 __m256 vn7 = _mm256_fmadd_ps(vx7, vlog2e, vmagic_bias); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72() 110 __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2_hi, vx7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72() 197 vf7 = _mm256_andnot_ps(_mm256_cmp_ps(vx7, vdenorm_cutoff, _CMP_LT_OS), vf7); in xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72()
|