1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_dsp/mips/vpx_convolve_msa.h"
14
/*
 * 8-tap horizontal followed by 8-tap vertical filtering of a block whose rows
 * are 4 bytes wide in dst, with the filtered result averaged into dst.
 *
 * src, src_stride - source pixels and row pitch. The function itself moves
 *                   src back by 3 columns and 3 rows before loading.
 * dst, dst_stride - destination. For every group of 4 rows, one 32-bit word
 *                   per row is read, averaged with the filtered result and
 *                   written back.
 * filter_horiz,
 * filter_vert     - filter taps. Each is loaded as one v8i16 and halfword
 *                   lanes 0..3 of that vector are splatted.
 * height          - number of rows. The loop runs height >> 2 times and emits
 *                   4 rows per iteration, so a remainder of height % 4 rows
 *                   is not produced.
 */
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

  /* Shuffle mask taken from offset 16 of the mask table (the 8-wide kernel in
   * this file uses offset 0). */
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  /* Step back 3 columns and 3 rows from the position passed in. */
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  /* Remaining masks are the first one with every byte offset by 2, 4 and 6. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* Prologue: the first 7 source rows. XORI_*_128 presumably flips the top
   * bit of every byte (unsigned -> signed range); PCKEV_XORI128_UB below is
   * the matching conversion back - confirm against the macro definitions. */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  /* Each horizontal call is given two source rows. */
  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  /* hz_out1 / hz_out3 are built from neighbouring results with an 8-byte
   * slide. */
  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* Interleave the horizontal results into the operands of the vertical
   * filter. */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Next four source rows. */
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    /* Gather one 32-bit word from each of the four dst rows into dst0. */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Shift by FILTER_BITS, saturate, pack to bytes, then average with the
     * existing dst pixels and store the 4x4 result. */
    SRARI_H2_SH(res0, res1, FILTER_BITS);
    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    res = (v16u8)__msa_aver_u_b(res, dst0);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the most recent intermediate results into the next iteration. */
    hz_out5 = hz_out9;
    vec0 = vec2;
    vec1 = vec3;
    vec2 = vec4;
  }
}
92
/*
 * 8-tap horizontal followed by 8-tap vertical filtering of a block whose rows
 * are 8 bytes wide in dst, with the filtered result averaged into dst.
 *
 * src, src_stride - source pixels and row pitch. The function itself moves
 *                   src back by 3 columns and 3 rows before loading.
 * dst, dst_stride - destination. For every group of 4 rows, one 64-bit word
 *                   per row is read, averaged with the filtered result and
 *                   written back.
 * filter_horiz,
 * filter_vert     - filter taps. Each is loaded as one v8i16 and halfword
 *                   lanes 0..3 of that vector are splatted.
 * height          - number of rows. The loop runs height >> 2 times and emits
 *                   4 rows per iteration, so a remainder of height % 4 rows
 *                   is not produced.
 */
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  uint64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

  /* Shuffle mask taken from offset 0 of the mask table (the 4-wide kernel in
   * this file uses offset 16). */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  /* Step back 3 columns and 3 rows from the position passed in. */
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  /* Remaining masks are the first one with every byte offset by 2, 4 and 6. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* Prologue: the first 7 source rows. */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  /* XORI_*_128 presumably flips the top bit of every byte (unsigned -> signed
   * range) - confirm against the macro definition. */
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  /* One horizontal pass per row; the same row is passed for both inputs. */
  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* out0/out1/out2 combine horizontal results (0,1), (2,3), (4,5);
   * out4/out5/out6 combine (1,2), (3,4), (5,6). */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Next four source rows. */
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    /* Four 64-bit dst rows, two per vector. */
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    /* First output row of this group. */
    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Second output row. */
    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Third output row. */
    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Fourth output row. */
    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Shift by FILTER_BITS and saturate; the macro below then converts to
     * bytes, combines with dst0/dst1 and stores 8x4 (see its definition). */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst,
                            dst_stride);
    dst += (4 * dst_stride);

    /* Slide both chains of interleaved rows down by four source rows. */
    hz_out6 = hz_out10;
    out0 = out2;
    out1 = out3;
    out2 = out8;
    out4 = out6;
    out5 = out7;
    out6 = out9;
  }
}
191
/*
 * 16-wide variant of the 8-tap/8-tap filter-and-average: the 8-wide kernel is
 * invoked on two side-by-side strips, each 8 columns further right in both
 * src and dst than the previous one.
 */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
203
/*
 * 32-wide variant of the 8-tap/8-tap filter-and-average: the 8-wide kernel is
 * invoked on four side-by-side strips, each 8 columns further right in both
 * src and dst than the previous one.
 */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
215
/*
 * 64-wide variant of the 8-tap/8-tap filter-and-average: the 8-wide kernel is
 * invoked on eight side-by-side strips, each 8 columns further right in both
 * src and dst than the previous one.
 */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
227
/*
 * 2-tap horizontal followed by 2-tap vertical filtering of a 4x4 block, with
 * the result averaged into dst.
 *
 * src, src_stride - source pixels and row pitch; 5 rows are loaded starting
 *                   at src (no backward offset is applied here).
 * dst, dst_stride - destination; one 32-bit word is read from each of 4 rows,
 *                   averaged with the filtered result and written back.
 * filter_horiz,
 * filter_vert     - filter taps; only halfword lane 0 of each load (the two
 *                   bytes at the pointer) is used.
 */
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 dst0 = { 0 }, out;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;

  /* Shuffle mask taken from offset 16 of the mask table. */
  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter_horiz);
  /* Replicate halfword lane 0 (the tap pair) across the whole vector. */
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* Five source rows are loaded for the four output rows. */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  /* Horizontal pass, two source rows per call; the fifth row is passed for
   * both inputs. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  /* Build the in-between vectors from halves of the neighbouring results. */
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);

  /* Gather the four existing dst rows into dst0. */
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  /* Vertical pass as a dot product with the vertical tap pair, then shift by
   * FILTER_BITS, pack to bytes, average with dst and store. */
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  out = __msa_aver_u_b(out, dst0);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
263
/*
 * 2-tap horizontal followed by 2-tap vertical filtering of a 4x8 block, with
 * the result averaged into dst.
 *
 * src, src_stride - source pixels and row pitch; 9 rows are loaded starting
 *                   at src (no backward offset is applied here).
 * dst, dst_stride - destination; one 32-bit word is read from each of 8 rows,
 *                   averaged with the filtered result and written back.
 * filter_horiz,
 * filter_vert     - filter taps; only halfword lane 0 of each load (the two
 *                   bytes at the pointer) is used.
 */
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
  v16u8 dst0 = { 0 }, dst1 = { 0 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* Shuffle mask taken from offset 16 of the mask table. */
  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  /* Replicate halfword lane 0 (the tap pair) across the whole vector. */
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Eight rows, plus a ninth loaded separately. */
  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  /* Horizontal pass, two source rows per call; the ninth row is passed for
   * both inputs. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
  /* Build the odd-numbered vectors from halves of the neighbouring results. */
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  /* Existing dst pixels: rows 0..3 into dst0, rows 4..7 into dst1. */
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
  /* Vertical pass as dot products with the vertical tap pair, then shift by
   * FILTER_BITS, pack to bytes, average with dst and store. */
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
              tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
  ST4x8_UB(res0, res1, dst, dst_stride);
}
310
/*
 * Height dispatcher for the 4-wide 2-tap/2-tap filter-and-average.
 * Heights 4 and 8 go to the matching fixed-size kernel; for any other height
 * nothing is done and dst is left untouched.
 */
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  switch (height) {
    case 4:
      common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    case 8:
      common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    default:
      /* No kernel for this height. */
      break;
  }
}
322
/*
 * 2-tap horizontal followed by 2-tap vertical filtering of an 8x4 block, with
 * the result averaged into dst.
 *
 * src, src_stride - source pixels and row pitch; 5 rows are loaded starting
 *                   at src (no backward offset is applied here).
 * dst, dst_stride - destination; one 64-bit word is read from each of 4 rows,
 *                   combined with the filtered result and written back.
 * filter_horiz,
 * filter_vert     - filter taps; only halfword lane 0 of each load (the two
 *                   bytes at the pointer) is used.
 */
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  uint64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* Shuffle mask taken from offset 0 of the mask table. */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  /* Replicate halfword lane 0 (the tap pair) across the whole vector. */
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Five source rows are loaded for the four output rows. */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  /* NOTE(review): src is not read again in this function after this
   * advance. */
  src += (5 * src_stride);

  /* Existing dst pixels, two 64-bit rows per vector. */
  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_D2_UB(tp0, tp1, dst0);
  INSERT_D2_UB(tp2, tp3, dst1);
  /* hz_out0 and hz_out1 alternate as "previous row" and "current row" so each
   * horizontally filtered row is computed once and used for two outputs. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  /* Shift by FILTER_BITS; the macro below then packs, combines with
   * dst0/dst1 and stores 8x4 (see its definition). */
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
}
367
/*
 * 2-tap horizontal followed by 2-tap vertical filtering of an 8-wide block of
 * variable height, with the result averaged into dst.
 *
 * src, src_stride - source pixels and row pitch; one row is loaded up front
 *                   and four more per loop iteration.
 * dst, dst_stride - destination; for every group of 4 rows, one 64-bit word
 *                   per row is read, combined with the filtered result and
 *                   written back.
 * filter_horiz,
 * filter_vert     - filter taps; only halfword lane 0 of each load (the two
 *                   bytes at the pointer) is used.
 * height          - number of rows. The loop runs height >> 2 times and emits
 *                   4 rows per iteration, so a remainder of height % 4 rows
 *                   is not produced.
 */
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  uint64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* Shuffle mask taken from offset 0 of the mask table. */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  /* Replicate halfword lane 0 (the tap pair) across the whole vector. */
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Prime the pipeline with the first source row. */
  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Next four source rows. */
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    /* hz_out0 and hz_out1 alternate as "previous row" and "current row";
     * after the fourth row hz_out0 holds the row carried into the next
     * iteration. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    /* Existing dst pixels, two 64-bit rows per vector; the macro below packs,
     * combines with them and stores 8x4 (see its definition). */
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
422
/*
 * Height dispatcher for the 8-wide 2-tap/2-tap filter-and-average.
 * A height of exactly 4 uses the dedicated 8x4 kernel; every other height is
 * handed to the looping kernel together with the height.
 */
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  if (4 != height) {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                         filter_horiz, filter_vert);
}
434
/*
 * 2-tap horizontal followed by 2-tap vertical filtering of a 16-wide block of
 * variable height, with the result averaged into dst.
 *
 * src, src_stride - source pixels and row pitch; every row is loaded as two
 *                   vectors, at src and at src + 8.
 * dst, dst_stride - destination; four full vectors (one per row) are read per
 *                   iteration, combined with the filtered result and written
 *                   back one row at a time.
 * filter_horiz,
 * filter_vert     - filter taps; only halfword lane 0 of each load (the two
 *                   bytes at the pointer) is used.
 * height          - number of rows. The loop runs height >> 2 times and emits
 *                   4 rows per iteration, so a remainder of height % 4 rows
 *                   is not produced.
 */
static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 filt;

  /* Shuffle mask taken from offset 0 of the mask table. */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  /* Replicate halfword lane 0 (the tap pair) across the whole vector. */
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Prime the pipeline with the first row: two loads, 8 bytes apart. */
  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  /* hz_out0 / hz_out2 hold the left / right half of the previous row. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Four more rows: even-numbered vectors from src, odd-numbered ones from
     * src + 8; plus the four existing dst rows. */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    /* Row 0 of this group: (hz_out0, hz_out2) previous, (hz_out1, hz_out3)
     * current. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
    dst += dst_stride;

    /* Row 1: the roles of the two register pairs swap. */
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
    dst += dst_stride;

    /* Row 2. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
    dst += dst_stride;

    /* Row 3: leaves hz_out0 / hz_out2 holding the row carried into the next
     * iteration. */
    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
    dst += dst_stride;
  }
}
498
/*
 * 32-wide variant of the 2-tap/2-tap filter-and-average: the 16-wide kernel
 * is invoked on two side-by-side strips, each 16 columns further right in
 * both src and dst than the previous one.
 */
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
510
/*
 * 64-wide variant of the 2-tap/2-tap filter-and-average: the 16-wide kernel
 * is invoked on four side-by-side strips, each 16 columns further right in
 * both src and dst than the previous one.
 */
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
522
/*
 * MSA entry point for the two-dimensional convolve-and-average.
 *
 * The horizontal and vertical kernels are picked from `filter` with x0_q4 and
 * y0_q4, narrowed to 8-bit taps, and the work is dispatched on the tap count
 * and on the block width:
 *   - both kernels 2-tap: the 2-tap MSA kernels, given a pointer to tap 3;
 *   - exactly one kernel 2-tap: the C implementation;
 *   - otherwise: the 8-tap MSA kernels, given all eight taps.
 * Widths other than 4, 8, 16, 32 and 64 also go to the C implementation.
 *
 * src, src_stride   - source pixels and row pitch.
 * dst, dst_stride   - destination; read for the average and overwritten.
 * filter            - table of interpolation kernels.
 * x0_q4, y0_q4      - indices into `filter` for the two directions.
 * x_step_q4,
 * y_step_q4         - must both be 16 (checked with assert only).
 * w, h              - block width and height.
 */
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int y_step_q4, int w, int h) {
  const int16_t *const filter_x = filter[x0_q4];
  const int16_t *const filter_y = filter[y0_q4];
  /* Every kernel loads a whole 16-byte vector (v8i16 / v8u16) from the tap
   * pointer it is given, and the 2-tap path passes &filt[3]. The arrays are
   * therefore zero-padded to 3 + 16 bytes so that load stays inside the
   * object. The padding never reaches the output: the kernels only splat
   * halfword lanes 0..3 (8-tap) or lane 0 (2-tap) of the loaded vector. */
  int8_t filt_hor[3 + 16] = { 0 };
  int8_t filt_ver[3 + 16] = { 0 };
  int cnt;

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* Reject a kernel whose taps 2 and 3 are (0, 128): 128 does not fit in the
   * int8_t taps used below. This is the condition the previous
   * `((const int32_t *)filter)[1] != 0x800000` test expressed on a
   * little-endian target, written without reading int16_t data through an
   * int32_t pointer. */
  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
  assert(!(filter_y[2] == 0 && filter_y[3] == 128));

  /* Narrow the 16-bit taps to the 8-bit form the MSA kernels take. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
    filt_ver[cnt] = (int8_t)filter_y[cnt];
  }

  if (vpx_get_filter_taps(filter_x) == 2 &&
      vpx_get_filter_taps(filter_y) == 2) {
    /* 2-tap in both directions: the kernels read the tap pair at index 3. */
    switch (w) {
      case 4:
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, &filt_hor[3],
                                              &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, &filt_hor[3],
                                              &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                            x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else if (vpx_get_filter_taps(filter_x) == 2 ||
             vpx_get_filter_taps(filter_y) == 2) {
    /* Mixed tap counts have no MSA kernel here. */
    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    /* 8-tap kernels in both directions. */
    switch (w) {
      case 4:
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                            x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}
612