/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
#define AOM_AOM_DSP_X86_BLEND_SSE4_H_

#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
// Shuffle control for _mm_shuffle_epi8: de-interleaves a row of bytes so
// that even-indexed bytes land in the low 8 lanes and odd-indexed bytes in
// the high 8 lanes of a 128-bit register. The 16-byte pattern is repeated
// twice — presumably so the same table can feed a 256-bit (per-lane) AVX2
// shuffle as well; confirm against the AVX2 callers.
static const uint8_t g_blend_a64_mask_shuffle[32] = {
  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
};

//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////

blend_4(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_w,const __m128i * v_m1_w)26 static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
27 const __m128i *v_m0_w, const __m128i *v_m1_w) {
28 const __m128i v_s0_b = xx_loadl_32(src0);
29 const __m128i v_s1_b = xx_loadl_32(src1);
30 const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
31 const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
32
33 const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
34 const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
35 const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
36 const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
37
38 return v_res_w;
39 }
40
blend_8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_w,const __m128i * v_m1_w)41 static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
42 const __m128i *v_m0_w, const __m128i *v_m1_w) {
43 const __m128i v_s0_b = xx_loadl_64(src0);
44 const __m128i v_s1_b = xx_loadl_64(src1);
45 const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
46 const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
47
48 const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
49 const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
50
51 const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
52
53 const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
54
55 return v_res_w;
56 }
57
blend_4_u8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_b,const __m128i * v_m1_b,const __m128i * rounding)58 static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
59 const __m128i *v_m0_b, const __m128i *v_m1_b,
60 const __m128i *rounding) {
61 const __m128i v_s0_b = xx_loadl_32(src0);
62 const __m128i v_s1_b = xx_loadl_32(src1);
63
64 const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
65 _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
66
67 const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
68 const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
69 return v_res;
70 }
71
blend_8_u8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_b,const __m128i * v_m1_b,const __m128i * rounding)72 static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
73 const __m128i *v_m0_b, const __m128i *v_m1_b,
74 const __m128i *rounding) {
75 const __m128i v_s0_b = xx_loadl_64(src0);
76 const __m128i v_s1_b = xx_loadl_64(src1);
77
78 const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
79 _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
80
81 const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
82 const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
83 return v_res;
84 }
85
blend_16_u8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_b,const __m128i * v_m1_b,const __m128i * rounding)86 static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
87 const __m128i *v_m0_b, const __m128i *v_m1_b,
88 const __m128i *rounding) {
89 const __m128i v_s0_b = xx_loadu_128(src0);
90 const __m128i v_s1_b = xx_loadu_128(src1);
91
92 const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
93 _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
94 const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
95 _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
96
97 const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
98 const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
99 const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
100 return v_res;
101 }
102
// Signature shared by the high-bitdepth blend kernels below
// (blend_{4,8}_b{10,12}), which take 16-bit pixel sources and receive the
// mask vectors by value rather than by pointer.
typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
                                 const __m128i v_m0_w, const __m128i v_m1_w);

blend_4_b10(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)106 static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
107 const __m128i v_m0_w, const __m128i v_m1_w) {
108 const __m128i v_s0_w = xx_loadl_64(src0);
109 const __m128i v_s1_w = xx_loadl_64(src1);
110
111 const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
112 const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
113
114 const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
115
116 const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
117
118 return v_res_w;
119 }
120
blend_8_b10(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)121 static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
122 const __m128i v_m0_w, const __m128i v_m1_w) {
123 const __m128i v_s0_w = xx_loadu_128(src0);
124 const __m128i v_s1_w = xx_loadu_128(src1);
125
126 const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
127 const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
128
129 const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
130
131 const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
132
133 return v_res_w;
134 }
135
// Blend 4 high-bitdepth (12-bit) pixels:
// res = ROUND_POWER_OF_TWO(src0 * m0 + src1 * m1, AOM_BLEND_A64_ROUND_BITS).
// Unlike the b10 variant, 12-bit products can exceed 16 bits, so the
// weighted sum is accumulated in 32-bit lanes via madd.
static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_s0_w = xx_loadl_64(src0);
  const __m128i v_s1_w = xx_loadl_64(src1);

  // Interleave pixels with pixels and masks with masks so that one
  // _mm_madd_epi16 computes s0*m0 + s1*m1 per 32-bit lane.
  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);

  // Multiply-Add (32-bit accumulation avoids 16-bit overflow).
  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);

  // Scale down by all but one of the rounding bits; the last bit is
  // handled with rounding by xx_round_epu16 below, after packing back to
  // 16-bit lanes (presumably xx_round_epu16 does a +1 >> 1 step — confirm
  // in synonyms.h).
  const __m128i v_ssum_d =
      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);

  // Pack to 16-bit lanes with signed saturation (values fit after the
  // shift above).
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);

  // Round away the final precision bit.
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);

  return v_res_w;
}

// Blend 8 high-bitdepth (12-bit) pixels:
// res = ROUND_POWER_OF_TWO(src0 * m0 + src1 * m1, AOM_BLEND_A64_ROUND_BITS).
// Same scheme as blend_4_b12, applied to both the low and high halves of a
// full 128-bit load: 32-bit madd accumulation, partial shift, pack, then a
// final rounding step.
static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_s0_w = xx_loadu_128(src0);
  const __m128i v_s1_w = xx_loadu_128(src1);

  // Interleave pixels with pixels and masks with masks so that each
  // _mm_madd_epi16 computes s0*m0 + s1*m1 per 32-bit lane.
  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);

  // Multiply-Add (32-bit accumulation avoids 16-bit overflow).
  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);

  // Scale down by all but one of the rounding bits; the last bit is
  // handled with rounding by xx_round_epu16 below, after packing back to
  // 16-bit lanes (presumably xx_round_epu16 does a +1 >> 1 step — confirm
  // in synonyms.h).
  const __m128i v_ssuml_d =
      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
  const __m128i v_ssumh_d =
      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);

  // Pack both halves to 16-bit lanes with signed saturation (values fit
  // after the shift above).
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);

  // Round away the final precision bit.
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);

  return v_res_w;
}

#endif  // AOM_AOM_DSP_X86_BLEND_SSE4_H_