/external/libaom/libaom/av1/encoder/x86/ |
D | wedge_utils_avx2.c |
    56  const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);  in av1_wedge_sse_from_residuals_avx2()
    57  const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);  in av1_wedge_sse_from_residuals_avx2()
    61  const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);  in av1_wedge_sse_from_residuals_avx2()
    116  const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);  in av1_wedge_sign_from_residuals_avx2()
    117  const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);  in av1_wedge_sign_from_residuals_avx2()
    118  const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);  in av1_wedge_sign_from_residuals_avx2()
    119  const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);  in av1_wedge_sign_from_residuals_avx2()
    191  const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w);  in av1_wedge_compute_delta_squares_avx2()
    192  const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w);  in av1_wedge_compute_delta_squares_avx2()
    193  const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w);  in av1_wedge_compute_delta_squares_avx2()
    [all …]
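
Every hit in this index is the same AVX2 intrinsic, _mm256_madd_epi16 (the vpmaddwd instruction): it multiplies corresponding signed 16-bit elements of its two operands and adds each adjacent pair of 32-bit products into a single 32-bit lane, i.e. sixteen multiplies and eight adds per call. The pairwise add can only overflow in the one case where both elements of a pair are -32768. As a reference point only (this helper appears in none of the files indexed here), a scalar model of the operation:

    #include <immintrin.h>
    #include <stdint.h>

    /* Scalar model of _mm256_madd_epi16:
     * out[i] = a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1], with 32-bit intermediates. */
    static __m256i madd_epi16_model(const int16_t a[16], const int16_t b[16]) {
      int32_t out[8];
      for (int i = 0; i < 8; ++i)
        out[i] = (int32_t)a[2 * i] * b[2 * i] + (int32_t)a[2 * i + 1] * b[2 * i + 1];
      return _mm256_loadu_si256((const __m256i *)out);
    }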
|
D | error_intrin_avx2.c |
    47  const __m256i error_lo = _mm256_madd_epi16(diff, diff);  in av1_block_error_lp_avx2()
    60  const __m256i error = _mm256_madd_epi16(diff, diff);  in av1_block_error_lp_avx2()
    106  dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);  in av1_block_error_avx2()
    108  coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);  in av1_block_error_avx2()
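
The block-error hits square a 16-bit difference by madd-ing it with itself, so each 32-bit lane already holds the sum of two squared values; the lanes are then accumulated and reduced horizontally (the real kernels go on to 64-bit accumulation). A minimal sketch of the idiom, assuming the length is a multiple of 16 and the total fits in 32 bits; ssd_epi16_sketch is a hypothetical name, not a libaom/libvpx function:

    #include <immintrin.h>
    #include <stdint.h>

    /* Sum of squared differences of two int16 buffers. */
    static int32_t ssd_epi16_sketch(const int16_t *a, const int16_t *b, int n) {
      __m256i acc = _mm256_setzero_si256();
      for (int i = 0; i < n; i += 16) {
        const __m256i va = _mm256_loadu_si256((const __m256i *)(a + i));
        const __m256i vb = _mm256_loadu_si256((const __m256i *)(b + i));
        const __m256i diff = _mm256_sub_epi16(va, vb);
        /* diff*diff with adjacent products summed into 32-bit lanes. */
        acc = _mm256_add_epi32(acc, _mm256_madd_epi16(diff, diff));
      }
      /* Horizontal reduction of the eight 32-bit lanes. */
      __m128i s = _mm_add_epi32(_mm256_castsi256_si128(acc),
                                _mm256_extracti128_si256(acc, 1));
      s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(1, 0, 3, 2)));
      s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1)));
      return _mm_cvtsi128_si32(s);
    }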
|
D | rdopt_avx2.c |
    41  const __m256i madd_xy = _mm256_madd_epi16(pixels, slli);  in horver_correlation_4x4()
    49  const __m256i madd_xz = _mm256_madd_epi16(slli, perm);  in horver_correlation_4x4()
    54  const __m256i madd1_slli = _mm256_madd_epi16(slli, _mm256_set1_epi16(1));  in horver_correlation_4x4()
    58  const __m256i madd_slli = _mm256_madd_epi16(slli, slli);  in horver_correlation_4x4()
|
/external/libaom/libaom/av1/common/x86/ |
D | highbd_wiener_convolve_avx2.c |
    105  const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);  in av1_highbd_wiener_convolve_add_src_avx2()
    106  const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);  in av1_highbd_wiener_convolve_add_src_avx2()
    107  const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);  in av1_highbd_wiener_convolve_add_src_avx2()
    108  const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);  in av1_highbd_wiener_convolve_add_src_avx2()
    109  const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);  in av1_highbd_wiener_convolve_add_src_avx2()
    110  const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);  in av1_highbd_wiener_convolve_add_src_avx2()
    111  const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);  in av1_highbd_wiener_convolve_add_src_avx2()
    112  const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);  in av1_highbd_wiener_convolve_add_src_avx2()
    196  const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);  in av1_highbd_wiener_convolve_add_src_avx2()
    197  const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);  in av1_highbd_wiener_convolve_add_src_avx2()
    [all …]
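
In the Wiener convolution the filter taps are packed in pairs (coeffs_01, coeffs_23, ...), so each madd appears to produce src[k]*f[k] + src[k+1]*f[k+1] per 32-bit lane, and summing res_0..res_7 gives the 8-tap result before rounding. A hedged sketch of that tap-pairing idea, reduced to a 4-tap vertical filter over 16-bit samples; vfilter4_sketch and its output layout are illustrative, not the libaom kernel:

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    /* 4-tap vertical filter on 16 columns: each madd lane is
     * s[r]*f[r] + s[r+1]*f[r+1] for one column. */
    static void vfilter4_sketch(const int16_t *src, ptrdiff_t stride,
                                const int16_t f[4], int32_t out[16]) {
      const __m256i r0 = _mm256_loadu_si256((const __m256i *)(src + 0 * stride));
      const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 1 * stride));
      const __m256i r2 = _mm256_loadu_si256((const __m256i *)(src + 2 * stride));
      const __m256i r3 = _mm256_loadu_si256((const __m256i *)(src + 3 * stride));
      /* Coefficient pairs (f0,f1) and (f2,f3) repeated in every 32-bit lane. */
      const __m256i c01 =
          _mm256_set1_epi32(((uint32_t)(uint16_t)f[1] << 16) | (uint16_t)f[0]);
      const __m256i c23 =
          _mm256_set1_epi32(((uint32_t)(uint16_t)f[3] << 16) | (uint16_t)f[2]);
      /* In-lane interleave of row pairs: *_lo covers columns 0-3 and 8-11,
       * *_hi covers columns 4-7 and 12-15. */
      const __m256i s01_lo = _mm256_unpacklo_epi16(r0, r1);
      const __m256i s01_hi = _mm256_unpackhi_epi16(r0, r1);
      const __m256i s23_lo = _mm256_unpacklo_epi16(r2, r3);
      const __m256i s23_hi = _mm256_unpackhi_epi16(r2, r3);
      const __m256i sum_lo = _mm256_add_epi32(_mm256_madd_epi16(s01_lo, c01),
                                              _mm256_madd_epi16(s23_lo, c23));
      const __m256i sum_hi = _mm256_add_epi32(_mm256_madd_epi16(s01_hi, c01),
                                              _mm256_madd_epi16(s23_hi, c23));
      /* Results land in the interleaved column order noted above; a real
       * kernel would round-shift and reorder from here. */
      _mm256_storeu_si256((__m256i *)(out + 0), sum_lo);
      _mm256_storeu_si256((__m256i *)(out + 8), sum_hi);
    }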
|
/external/libvpx/libvpx/vpx_dsp/x86/ |
D | fwd_dct32x32_impl_avx2.h |
    365  const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);  in FDCT32x32_2D_AVX2()
    366  const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);  in FDCT32x32_2D_AVX2()
    367  const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);  in FDCT32x32_2D_AVX2()
    368  const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);  in FDCT32x32_2D_AVX2()
    369  const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);  in FDCT32x32_2D_AVX2()
    370  const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);  in FDCT32x32_2D_AVX2()
    371  const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);  in FDCT32x32_2D_AVX2()
    372  const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);  in FDCT32x32_2D_AVX2()
    373  const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);  in FDCT32x32_2D_AVX2()
    374  const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);  in FDCT32x32_2D_AVX2()
    [all …]
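
The forward-DCT hits use madd for the classic rotation/butterfly step: two 16-bit inputs are interleaved and multiplied by a packed constant pair such as k__cospi_p16_m16, so each 32-bit lane becomes a*cospi ± b*cospi, which is then rounded and shifted back down by DCT_CONST_BITS. A sketch of one such step; rotation_sketch is a made-up helper and DCT_CONST_BITS = 14 is assumed to match libvpx:

    #include <immintrin.h>
    #include <stdint.h>

    #define DCT_CONST_BITS 14 /* assumed, as in libvpx */
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* Each 32-bit output lane is round(a*c0 + b*c1) >> DCT_CONST_BITS,
     * computed for sixteen 16-bit (a, b) pairs at once. */
    static void rotation_sketch(__m256i a, __m256i b, int16_t c0, int16_t c1,
                                __m256i *out_lo, __m256i *out_hi) {
      const __m256i k_pair =
          _mm256_set1_epi32(((uint32_t)(uint16_t)c1 << 16) | (uint16_t)c0);
      const __m256i k_round = _mm256_set1_epi32(DCT_CONST_ROUNDING);
      const __m256i ab_lo = _mm256_unpacklo_epi16(a, b); /* in-lane interleave */
      const __m256i ab_hi = _mm256_unpackhi_epi16(a, b);
      const __m256i m_lo = _mm256_madd_epi16(ab_lo, k_pair); /* a*c0 + b*c1 */
      const __m256i m_hi = _mm256_madd_epi16(ab_hi, k_pair);
      *out_lo = _mm256_srai_epi32(_mm256_add_epi32(m_lo, k_round), DCT_CONST_BITS);
      *out_hi = _mm256_srai_epi32(_mm256_add_epi32(m_hi, k_round), DCT_CONST_BITS);
    }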
|
/external/flac/src/libFLAC/ |
D | lpc_intrin_avx2.c |
    80  summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    81  …mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_…  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    82  …mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_…  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    83  …mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_…  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    84  …mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_…  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    85  …mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_…  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    86  …mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_…  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    87  …mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_…  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    88  …mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_…  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    89  …mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_…  in FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2()
    [all …]
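
My reading of the FLAC hits, stated tentatively: data is FLAC__int32, but in this _16 variant the samples are known to fit in 16 bits, and each qN register seems to carry one 16-bit coefficient in the low half of every 32-bit lane with zero in the high half. _mm256_madd_epi16 then acts as an 8-wide coeff*sample multiply (low half of the sample times the coefficient, plus its sign-extension half times zero), and the per-tap products are accumulated into summ. A small sketch of that trick under those assumptions; mul_small_i32_by_i16 is hypothetical, not FLAC code:

    #include <immintrin.h>
    #include <stdint.h>

    /* samples_i32: eight 32-bit samples whose values fit in 16 bits.
     * Returns the eight 32-bit products coeff * sample. */
    static __m256i mul_small_i32_by_i16(__m256i samples_i32, int16_t coeff) {
      /* coeff in the low 16 bits of each lane, 0 in the high 16 bits, so the
       * sign-extension half of each sample is multiplied by zero. */
      const __m256i q = _mm256_set1_epi32((uint16_t)coeff);
      return _mm256_madd_epi16(q, samples_i32);
    }

Accumulating one such product per predictor tap, then shifting and subtracting from data[i], is broadly what the summ/mull chains above appear to be doing.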
|
/external/libvpx/libvpx/vp9/encoder/x86/ |
D | vp9_error_avx2.c |
    37  dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256);  in vp9_block_error_avx2()
    39  coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256);  in vp9_block_error_avx2()
    66  dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0);  in vp9_block_error_avx2()
    67  dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1);  in vp9_block_error_avx2()
    69  coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0);  in vp9_block_error_avx2()
    70  coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1);  in vp9_block_error_avx2()
    125  const __m256i error_lo = _mm256_madd_epi16(diff, diff);  in vp9_block_error_fp_avx2()
    138  const __m256i error = _mm256_madd_epi16(diff, diff);  in vp9_block_error_fp_avx2()
|
/external/libaom/libaom/aom_dsp/x86/ |
D | sum_squares_avx2.c |
    38  const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);  in aom_sum_squares_2d_i16_nxn_avx2()
    39  const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);  in aom_sum_squares_2d_i16_nxn_avx2()
    40  const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);  in aom_sum_squares_2d_i16_nxn_avx2()
    41  const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);  in aom_sum_squares_2d_i16_nxn_avx2()
    132  __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);  in aom_var_2d_u8_avx2()
    133  __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);  in aom_var_2d_u8_avx2()
    153  __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);  in aom_var_2d_u8_avx2()
    154  __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);  in aom_var_2d_u8_avx2()
    206  __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]);  in aom_var_2d_u16_avx2()
    225  __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc);  in aom_var_2d_u16_avx2()
|
D | blk_sse_sum_avx2.c |
    68  row_sum_buffer = _mm256_madd_epi16(load_pixels, one);  in sse_sum_wd4_avx2()
    69  row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);  in sse_sum_wd4_avx2()
    104  row_sum_buffer = _mm256_madd_epi16(load_pixels, one);  in sse_sum_wd8_avx2()
    105  row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);  in sse_sum_wd8_avx2()
    137  row_sum_buffer = _mm256_madd_epi16(load_pixels, one);  in sse_sum_wd16_avx2()
    138  row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);  in sse_sum_wd16_avx2()
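
Here (and in the highbd variance, masked SAD, and gemmlowp Pack hits further down) _mm256_madd_epi16 also appears with a vector of ones: madd(x, _mm256_set1_epi16(1)) is just a widening pairwise add, turning sixteen 16-bit values into eight 32-bit partial sums so the running row sum cannot overflow. A minimal sketch; partial_sum_epi16 is a hypothetical helper, and its lanes would be reduced the same way as in the SSD sketch above:

    #include <immintrin.h>
    #include <stdint.h>

    /* Eight 32-bit partial sums of n int16 values (n a multiple of 16). */
    static __m256i partial_sum_epi16(const int16_t *x, int n) {
      const __m256i one = _mm256_set1_epi16(1);
      __m256i acc = _mm256_setzero_si256();
      for (int i = 0; i < n; i += 16) {
        const __m256i v = _mm256_loadu_si256((const __m256i *)(x + i));
        acc = _mm256_add_epi32(acc, _mm256_madd_epi16(v, one));
      }
      return acc;
    }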
|
D | sse_avx2.c |
    31  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));  in sse_w32_avx2()
    32  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));  in sse_w32_avx2()
    86  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));  in sse_w4x4_avx2()
    97  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));  in sse_w8x2_avx2()
    141  _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),  in aom_sse_avx2()
    142  _mm256_madd_epi16(v_bsub, v_bsub));  in aom_sse_avx2()
    222  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));  in highbd_sse_w16_avx2()
    241  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));  in highbd_sse_w4x4_avx2()
    250  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));  in highbd_sse_w8x2_avx2()
|
D | obmc_variance_avx2.c |
    53  const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);  in obmc_variance_w8n()
    113  const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);  in obmc_variance_w16n()
    114  const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);  in obmc_variance_w16n()
    132  const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);  in obmc_variance_w16n()
|
D | convolve_avx2.h |
    344  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);  in convolve()
    345  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);  in convolve()
    346  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);  in convolve()
    347  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);  in convolve()
    357  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);  in convolve_4tap()
    358  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);  in convolve_4tap()
    411  const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);  in comp_avg()
    412  const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);  in comp_avg()
|
D | obmc_sad_avx2.c |
    49  const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);  in obmc_sad_w4_avx2()
    90  const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);  in obmc_sad_w8n_avx2()
    171  const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);  in hbd_obmc_sad_w4_avx2()
    216  const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);  in hbd_obmc_sad_w8n_avx2()
|
D | highbd_variance_avx2.c |
    36  const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);  in aom_highbd_calc8x8var_avx2()
    66  const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);  in aom_highbd_calc16x16var_avx2()
    72  __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);  in aom_highbd_calc16x16var_avx2()
|
D | masked_sad_intrin_avx2.c |
    228  __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);  in highbd_masked_sad8xh_avx2()
    234  __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);  in highbd_masked_sad8xh_avx2()
    245  res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));  in highbd_masked_sad8xh_avx2()
    285  __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);  in highbd_masked_sad16xh_avx2()
    291  __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);  in highbd_masked_sad16xh_avx2()
    302  res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));  in highbd_masked_sad16xh_avx2()
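
The high-bit-depth masked SAD interleaves two predictors (data_l/data_r) with the complementary mask pair (mask_l/mask_r), so one madd yields m*p0 + (64 - m)*p1 per 32-bit lane; a rounding shift by 6 gives the blended prediction, and the difference against the source is later widened with madd(diff, one). A hedged sketch of the blend step, assuming the usual 0..64 mask range; blend_a64_sketch is illustrative, not the libaom function:

    #include <immintrin.h>
    #include <stdint.h>

    /* out = (m*p0 + (64 - m)*p1 + 32) >> 6 for sixteen 16-bit pixels. */
    static __m256i blend_a64_sketch(__m256i p0, __m256i p1, __m256i m) {
      const __m256i round = _mm256_set1_epi32(1 << 5);
      const __m256i m_inv = _mm256_sub_epi16(_mm256_set1_epi16(64), m);
      /* Interleave pixel with pixel and weight with weight so every madd
       * lane is m*p0 + (64-m)*p1. */
      const __m256i data_lo = _mm256_unpacklo_epi16(p0, p1);
      const __m256i data_hi = _mm256_unpackhi_epi16(p0, p1);
      const __m256i mask_lo = _mm256_unpacklo_epi16(m, m_inv);
      const __m256i mask_hi = _mm256_unpackhi_epi16(m, m_inv);
      const __m256i b_lo = _mm256_srai_epi32(
          _mm256_add_epi32(_mm256_madd_epi16(data_lo, mask_lo), round), 6);
      const __m256i b_hi = _mm256_srai_epi32(
          _mm256_add_epi32(_mm256_madd_epi16(data_hi, mask_hi), round), 6);
      /* The in-lane unpack/pack pair restores the original pixel order. */
      return _mm256_packs_epi32(b_lo, b_hi);
    }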
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x8c8-minmax-avx2.c |
    109  vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    110  vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    111  vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    115  vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    116  vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    117  vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    121  vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    122  vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    123  vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    127  vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));  in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
    [all …]
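
All of the XNNPACK QS8 kernels in this and the next directory follow one idiom: int8 activations and weights are sign-extended to 16 bits (vxa*, vxb*), madd forms the pairwise products, and the result is added into 32-bit accumulators; since the inputs started as 8-bit values, each madd contributes at most 2*128*128 = 32768 in magnitude, leaving ample headroom in the 32-bit lanes. A stripped-down sketch of that dot-product core; dot_qs8_partial is hypothetical and ignores the kernels' two-columns-per-register accumulator layout:

    #include <immintrin.h>
    #include <stdint.h>

    /* Eight 32-bit partial sums of a[i]*b[i] over n int8 values
     * (n a multiple of 16); reduce the lanes afterwards as needed. */
    static __m256i dot_qs8_partial(const int8_t *a, const int8_t *b, int n) {
      __m256i acc = _mm256_setzero_si256();
      for (int i = 0; i < n; i += 16) {
        const __m256i va =
            _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i *)(a + i)));
        const __m256i vb =
            _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i *)(b + i)));
        acc = _mm256_add_epi32(acc, _mm256_madd_epi16(va, vb));
      }
      return acc;
    }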
|
D | 2x8c8-minmax-avx2.c |
    94  vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
    95  vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
    99  vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
    100  vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
    104  vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
    105  vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
    109  vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
    110  vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));  in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
|
D | 1x8c8-minmax-avx2.c |
    79  vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));  in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    83  vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));  in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    87  vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));  in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
    91  vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));  in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x8c8-minmax-avx2.c |
    94  vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    95  vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    96  vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    100  vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    101  vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    102  vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    106  vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    107  vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    108  vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    112  vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));  in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
    [all …]
|
D | 3x8c8-xw-minmax-avx2.c |
    93  vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    94  vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    95  vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    98  vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    99  vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    100  vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    103  vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    104  vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    105  vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    108  vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));  in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
    [all …]
|
D | 2x8c8-minmax-avx2.c |
    81  vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
    82  vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
    86  vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
    87  vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
    91  vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
    92  vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
    96  vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
    97  vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));  in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
|
D | 2x8c8-xw-minmax-avx2.c |
    80  vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));  in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
    81  vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));  in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
    84  vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));  in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
    85  vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));  in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
    88  vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));  in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
    89  vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));  in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
    92  vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));  in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
    93  vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));  in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
|
D | 1x8c8-minmax-avx2.c |
    68  vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));  in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    72  vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));  in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    76  vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));  in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
    80  vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));  in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
|
D | 1x8c8-xw-minmax-avx2.c |
    67  vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));  in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
    70  vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));  in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
    73  vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));  in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
    76  vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));  in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
|
/external/gemmlowp/internal/ |
D | pack_avx.h |
    139  ymm7 = _mm256_madd_epi16(ymm6, one);  in Pack()
    145  ymm7 = _mm256_madd_epi16(ymm6, one);  in Pack()
    149  ymm7 = _mm256_madd_epi16(ymm6, one);  in Pack()
    153  ymm7 = _mm256_madd_epi16(ymm6, one);  in Pack()
    157  ymm7 = _mm256_madd_epi16(ymm6, one);  in Pack()
    161  ymm7 = _mm256_madd_epi16(ymm6, one);  in Pack()
    165  ymm7 = _mm256_madd_epi16(ymm6, one);  in Pack()
    169  ymm7 = _mm256_madd_epi16(ymm6, one);  in Pack()
|