/external/libgav1/libgav1/src/dsp/arm/ |
D | intra_edge_neon.cc | 78 uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2)); in IntraEdgeFilter_NEON() local 79 sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]); in IntraEdgeFilter_NEON() 80 sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1); in IntraEdgeFilter_NEON() 86 vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); in IntraEdgeFilter_NEON() 102 uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2)); in IntraEdgeFilter_NEON() local 103 sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]); in IntraEdgeFilter_NEON() 104 sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1); in IntraEdgeFilter_NEON() 110 vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); in IntraEdgeFilter_NEON() 158 uint16x8_t sum_lo = in IntraEdgeFilter_NEON() local 162 sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2)); in IntraEdgeFilter_NEON() [all …]
|
D | convolve_neon.cc | 425 int32x4_t sum_lo, sum_hi; in SimpleSum2DVerticalTaps() local 427 sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0); in SimpleSum2DVerticalTaps() 429 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1); in SimpleSum2DVerticalTaps() 431 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2); in SimpleSum2DVerticalTaps() 433 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3); in SimpleSum2DVerticalTaps() 436 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0); in SimpleSum2DVerticalTaps() 438 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1); in SimpleSum2DVerticalTaps() 440 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2); in SimpleSum2DVerticalTaps() 442 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3); in SimpleSum2DVerticalTaps() 445 sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1); in SimpleSum2DVerticalTaps() [all …]
|
/external/libgav1/libgav1/src/dsp/x86/ |
D | intra_edge_sse4.cc | 222 __m128i sum_lo = _mm_sub_epi16(_mm_alignr_epi8(src9_hi, src9_lo, 2), src_lo); in IntraEdgeUpsampler_SSE4_1() local 223 sum_lo = _mm_add_epi16(sum_lo, _mm_alignr_epi8(src9_hi, src9_lo, 4)); in IntraEdgeUpsampler_SSE4_1() 224 sum_lo = _mm_sub_epi16(sum_lo, _mm_alignr_epi8(src_hi, src_lo, 6)); in IntraEdgeUpsampler_SSE4_1() 225 sum_lo = RightShiftWithRounding_S16(sum_lo, 4); in IntraEdgeUpsampler_SSE4_1() 226 const __m128i result_lo = _mm_unpacklo_epi8(_mm_packus_epi16(sum_lo, sum_lo), in IntraEdgeUpsampler_SSE4_1()
|
D | convolve_sse4.cc | 1001 __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]); in Sum2DVerticalTaps() local 1006 sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1])); in Sum2DVerticalTaps() 1012 sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2])); in Sum2DVerticalTaps() 1018 sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3])); in Sum2DVerticalTaps() 1024 RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), in Sum2DVerticalTaps() 1029 RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), in Sum2DVerticalTaps() 1039 __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]); in Sum2DVerticalTaps4x2() local 1044 sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1])); in Sum2DVerticalTaps4x2() 1050 sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2])); in Sum2DVerticalTaps4x2() 1056 sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3])); in Sum2DVerticalTaps4x2() [all …]
|
D | convolve_sse4.inc | 164 __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); 171 sum_lo = _mm_add_epi32(sum_lo, madd_lo); 176 sum_lo = _mm_add_epi32(sum_lo, madd_lo); 181 sum_lo = _mm_add_epi32(sum_lo, madd_lo); 189 RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), 195 RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
|
D | convolve_avx2.cc | 428 __m256i sum_lo = in SimpleSum2DVerticalTaps() local 437 sum_lo = _mm256_add_epi32(sum_lo, madd_lo); in SimpleSum2DVerticalTaps() 444 sum_lo = _mm256_add_epi32(sum_lo, madd_lo); in SimpleSum2DVerticalTaps() 451 sum_lo = _mm256_add_epi32(sum_lo, madd_lo); in SimpleSum2DVerticalTaps() 459 RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), in SimpleSum2DVerticalTaps() 465 RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), in SimpleSum2DVerticalTaps()
|
D | loop_restoration_10bit_avx2.cc | 1316 const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128()); in CalculateMa() local 1318 const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale); in CalculateMa() 1347 const __m256i sum_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256()); in CalculateMa() local 1349 const __m256i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale); in CalculateMa()
|
D | loop_restoration_avx2.cc | 1391 const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128()); in CalculateMa() local 1393 const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale); in CalculateMa() 1421 const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256()); in CalculateMa() local 1423 const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale); in CalculateMa()
|
D | loop_restoration_10bit_sse4.cc | 1017 const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128()); in CalculateMa() local 1019 const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale); in CalculateMa()
|
D | loop_restoration_sse4.cc | 1158 const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128()); in CalculateMa() local 1160 const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale); in CalculateMa()
|
/external/libvpx/libvpx/vp8/common/x86/ |
D | bilinear_filter_sse2.c | 56 const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered); in horizontal_16x16() local 59 const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); in horizontal_16x16() 108 const __m128i sum_lo = in vertical_16x16() local 113 const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); in vertical_16x16()
|
/external/libvpx/libvpx/vp9/common/arm/neon/ |
D | vp9_highbd_iht16x16_add_neon.c | 122 const int64x2x2_t sum_lo = vaddq_s64_dual(in0[0], in1[0]); in highbd_add_dct_const_round_shift_low_8() local 126 out_lo.val[0] = vrshrn_n_s64(sum_lo.val[0], DCT_CONST_BITS); in highbd_add_dct_const_round_shift_low_8() 127 out_lo.val[1] = vrshrn_n_s64(sum_lo.val[1], DCT_CONST_BITS); in highbd_add_dct_const_round_shift_low_8()
|
D | vp9_highbd_iht8x8_add_neon.c | 72 const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]); in highbd_add_dct_const_round_shift_low_8() local 74 const int32x2_t out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS); in highbd_add_dct_const_round_shift_low_8()
|
/external/libaom/libaom/aom_dsp/x86/ |
D | variance_avx2.c | 86 const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); in sum_to_32bit_avx2() local 89 return _mm256_add_epi32(sum_lo, sum_hi); in sum_to_32bit_avx2()
|
D | variance_sse2.c | 64 const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); in sum_to_32bit_sse2() local 66 return _mm_add_epi32(sum_lo, sum_hi); in sum_to_32bit_sse2()
|
D | highbd_intrapred_sse2.c | 426 const __m128i sum_lo = dc_sum_8(ref); in dc_sum_16() local 428 return _mm_add_epi16(sum_lo, sum_hi); in dc_sum_16()
|
/external/libvpx/libvpx/vpx_dsp/x86/ |
D | variance_sse2.c | 92 const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); in sum_to_32bit_sse2() local 94 return _mm_add_epi32(sum_lo, sum_hi); in sum_to_32bit_sse2()
|
D | highbd_intrapred_intrin_sse2.c | 270 const __m128i sum_lo = dc_sum_8(ref); in dc_sum_16() local 272 return _mm_add_epi16(sum_lo, sum_hi); in dc_sum_16()
|
D | variance_avx2.c | 95 const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); in sum_to_32bit_avx2() local 98 return _mm256_add_epi32(sum_lo, sum_hi); in sum_to_32bit_avx2()
|
/external/libvpx/libvpx/vp9/encoder/x86/ |
D | highbd_temporal_filter_sse4.c | 80 const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); in highbd_average_4() local 85 const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); in highbd_average_4()
|
/external/libaom/libaom/av1/encoder/x86/ |
D | temporal_filter_sse4.c | 1112 const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); in highbd_average_4() local 1117 const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); in highbd_average_4()
|