/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

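/* 4x4 block: 8-tap horizontal filter; the rounded, saturated result is
   averaged with the existing dst pixels before storing. */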
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst0, dst1, dst2, dst3, res2, res3;
  v16u8 mask0, mask1, mask2, mask3;
  v8i16 filt, res0, res1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, res0, res1);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  SRARI_H2_SH(res0, res1, FILTER_BITS);
  SAT_SH2_SH(res0, res1, 7);
  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  XORI_B2_128_UB(res2, res3);
  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

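/* 4x8 block: 8-tap horizontal filter, averaged with dst before storing. */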
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 filt, vec0, vec1, vec2, vec3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec0, vec1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec2, vec3);
  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
              res3);
  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
  XORI_B2_128_UB(res0, res2);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
             dst6);
  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
  ST4x8_UB(res0, res2, dst, dst_stride);
}

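/* Width 4: dispatch 8-tap horizontal average-convolve on block height. */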
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

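/* Width 8: 8-tap horizontal filter with dst averaging, 4 rows per loop. */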
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
                            dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

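/* Width 16: 8-tap horizontal filter with dst averaging, 2 rows per loop. */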
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height >> 1; loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    src += (2 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
    dst += dst_stride;
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
    dst += dst_stride;
  }
}

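/* Width 32: 8-tap horizontal filter with dst averaging, 1 row per loop. */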
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    LD_UB2(dst, 16, dst1, dst2);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
    dst += dst_stride;
  }
}

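/* Width 64: 8-tap horizontal filter with dst averaging; each row is
   processed as two 32-pixel halves. */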
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    for (cnt = 0; cnt < 2; ++cnt) {
      src0 = LD_SB(&src[cnt << 5]);
      src2 = LD_SB(&src[16 + (cnt << 5)]);
      src3 = LD_SB(&src[24 + (cnt << 5)]);
      src1 = __msa_sldi_b(src2, src0, 8);

      XORI_B4_128_SB(src0, src1, src2, src3);
      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                 vec12);
      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                 vec13);
      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
                 vec14);
      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
                 vec15);
      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                  vec1, vec2, vec3);
      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                  vec9, vec10, vec11);
      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                   vec1, vec2, vec3);
      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                   vec9, vec10, vec11);
      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                  out2, out3);
      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
      SAT_SH4_SH(out0, out1, out2, out3, 7);
      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
    }

    src += src_stride;
    dst += dst_stride;
  }
}

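/* 4x4 block: 2-tap (bilinear) horizontal filter, averaged with dst. */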
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

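/* 4x8 block: 2-tap (bilinear) horizontal filter, averaged with dst. */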
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
             dst6);
  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

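/* Width 4: dispatch 2-tap horizontal average-convolve on block height. */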
static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

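/* 8x4 block: 2-tap horizontal filter, averaged with dst. */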
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                     dst, dst_stride);
}

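/* Width 8, height 8 or 16: 2-tap horizontal filter with dst averaging,
   rows processed in groups of four. */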
static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter,
                                                  int32_t height) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                     dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                     dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                       dst, dst_stride);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                       dst, dst_stride);
  }
}

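/* Width 8: dispatch 2-tap horizontal average-convolve on block height. */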
static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                          filter, height);
  }
}

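/* Width 16: 2-tap horizontal filter with dst averaging, 4 rows per loop. */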
static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
              res2, res3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
              res6, res7);
  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
  dst += dst_stride;

  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
    dst += dst_stride;
  }
}

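/* Width 32: 2-tap horizontal filter with dst averaging, 2 rows per loop. */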
static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB2(dst, 16, dst0, dst1);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
    dst += dst_stride;
    LD_UB2(dst, 16, dst2, dst3);
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
    dst += dst_stride;
  }
}

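/* Width 64: 2-tap horizontal filter with dst averaging, 1 row per loop. */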
static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    LD_SB4(src, 16, src0, src2, src4, src6);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
    dst += dst_stride;
  }
}

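/* Horizontal-only average convolution entry point: takes the 2-tap path
   when the leading filter_x taps are zero (bilinear case), the 8-tap path
   otherwise, and falls back to the C implementation for unsupported widths. */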
void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h) {
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0) {
    switch (w) {
      case 4:
        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                                  filter_x, x_step_q4, filter_y, y_step_q4,
                                  w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         filt_hor, h);
        break;
      case 8:
        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         filt_hor, h);
        break;
      case 16:
        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          filt_hor, h);
        break;
      case 32:
        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          filt_hor, h);
        break;
      case 64:
        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          filt_hor, h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                                  filter_x, x_step_q4, filter_y, y_step_q4,
                                  w, h);
        break;
    }
  }
}