1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vp8_rtcd.h"
12 #include "vpx_ports/mem.h"
13 #include "vp8/common/filter.h"
14 #include "vp8/common/mips/msa/vp8_macros_msa.h"
15 
/* VP8 sub-pixel interpolation kernels as signed 8-bit taps: seven rows, each
 * holding six filter taps followed by two zero bytes of padding. The taps in
 * every row sum to 128. Rows whose first and sixth taps are both zero have
 * only four non-zero taps.
 * NOTE(review): the users of this table are outside the visible part of the
 * file; presumably row i serves sub-pixel offset (i + 1) / 8 -- confirm. */
DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) = {
  { 0, -6, 123, 12, -1, 0, 0, 0 },
  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -9, 93, 50, -6, 0, 0, 0 },
  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
  { 0, -6, 50, 93, -9, 0, 0, 0 },
  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -1, 12, 123, -6, 0, 0, 0 },
};
25 
/* Byte-shuffle index patterns (three groups of 16) used to gather adjacent
 * pixel pairs for the horizontal filters in this file. A function loads one
 * group as mask0 and derives the masks for the following tap pairs by adding
 * 2 and 4 to every index. */
static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
  /* Offset 0: loaded by the 8-pixel-wide paths; pairs (i, i + 1) for
   * i = 0..7. */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* Offset 16: loaded by the 4-pixel-wide paths. Indices 16 and above
   * presumably address the second shuffle source so that two rows share one
   * vector -- confirm against VSHF_B2_SB. */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* Offset 32: a second 4-wide pattern whose indices start at 8 and 24; it is
   * not referenced in the visible part of the file. */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
34 
/* Horizontal 6-tap filter that evaluates to a finished v8i16 result (GNU
 * statement expression).
 *
 * src0/src1 are shuffled three times (mask0, mask1, mask2) to build the three
 * tap-pair operands, which DPADD_SH3_SH combines with filt_h0..filt_h2. The
 * sum is then rounded with a shift of VP8_FILTER_SHIFT and saturated with
 * __msa_sat_s_h(..., 7).
 * The src and mask arguments are expanded more than once, so pass plain
 * variables rather than expressions with side effects. */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \
                        filt_h2)                                           \
  ({                                                                       \
    v16i8 vec0_m, vec1_m, vec2_m;                                          \
    v8i16 hz_out_m;                                                        \
                                                                           \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,    \
               vec0_m, vec1_m, vec2_m);                                    \
    hz_out_m =                                                             \
        DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);   \
                                                                           \
    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);                  \
    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                                 \
                                                                           \
    hz_out_m;                                                              \
  })
51 
/* Raw (unrounded, unsaturated) horizontal 6-tap sums for four source vectors
 * of a 4-pixel-wide block.
 *
 * src0/src1 are shuffled together into one operand and src2/src3 into
 * another, so out0 receives the sums for the first pair of vectors and out1
 * those for the second pair. out0/out1 are overwritten by the first dot
 * product and then accumulated into; they must be assignable lvalues.
 * Rounding and saturation are left to the caller. */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   mask2, filt0, filt1, filt2, out0, out1) \
  {                                                                        \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
  }
64 
/* Raw (unrounded, unsaturated) horizontal 6-tap sums for four source vectors
 * of an 8-pixel-wide block.
 *
 * Each source vector is shuffled with itself, so out0..out3 hold the sums for
 * src0..src3 respectively. The outputs are overwritten by the first dot
 * product and then accumulated into; they must be assignable lvalues.
 * Rounding and saturation are left to the caller. */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
                                   mask2, filt0, filt1, filt2, out0, out1,   \
                                   out2, out3)                               \
  {                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3);                                    \
  }
84 
/* 4-tap multiply-accumulate that evaluates to a v8i16 (GNU statement
 * expression): the signed dot product of vec0 with filt0, plus the signed
 * dot product of vec1 with filt1. No rounding or saturation is applied.
 *
 * The internal accumulator carries the "_m" suffix used by the other macros
 * in this file so that it cannot capture a caller variable (functions here
 * declare locals named tmp0), and every argument is parenthesised before the
 * cast so that expression arguments are converted as a whole. */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                      \
  ({                                                                       \
    v8i16 acc_4t_m;                                                        \
                                                                           \
    acc_4t_m = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0));              \
    acc_4t_m = __msa_dpadd_s_h(acc_4t_m, (v16i8)(vec1), (v16i8)(filt1));   \
                                                                           \
    acc_4t_m;                                                              \
  })
94 
/* Horizontal 4-tap filter that evaluates to a finished v8i16 result (GNU
 * statement expression).
 *
 * src0/src1 are shuffled with mask0 and mask1 to build the two tap-pair
 * operands, which FILT_4TAP_DPADD_S_H combines with filt_h0/filt_h1. The sum
 * is then rounded with a shift of VP8_FILTER_SHIFT and saturated with
 * __msa_sat_s_h(..., 7).
 * src0/src1 are expanded more than once, so pass plain variables. */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)   \
  ({                                                                  \
    v16i8 vec0_m, vec1_m;                                             \
    v8i16 hz_out_m;                                                   \
                                                                      \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \
    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \
                                                                      \
    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);             \
    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                            \
                                                                      \
    hz_out_m;                                                         \
  })
108 
/* Raw (unrounded, unsaturated) horizontal 4-tap sums for four source vectors
 * of a 4-pixel-wide block.
 *
 * src0/src1 are shuffled together into one operand and src2/src3 into
 * another, so out0 receives the sums for the first pair of vectors and out1
 * those for the second pair. out0/out1 are overwritten and then accumulated
 * into; they must be assignable lvalues. Rounding and saturation are left to
 * the caller. */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
                                   filt0, filt1, out0, out1)             \
  {                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
                                                                         \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);    \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);    \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);              \
  }
119 
/* Raw (unrounded, unsaturated) horizontal 4-tap sums for four source vectors
 * of an 8-pixel-wide block.
 *
 * Each source vector is shuffled with itself, so out0..out3 hold the sums for
 * src0..src3 respectively. The temporaries vec0_m..vec3_m are reused for the
 * second tap pair after the first dot product has consumed them. The outputs
 * must be assignable lvalues; rounding and saturation are left to the
 * caller. */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
                                   filt0, filt1, out0, out1, out2, out3)     \
  {                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
  }
134 
/* Horizontal 6-tap filter for a single 4x4 block.
 *
 * src, src_stride: source pixels; each of the four rows is read starting two
 *                  bytes to the left of src.
 * dst, dst_stride: destination of the 4x4 result.
 * filter:          signed 8-bit taps; loaded as one vector, of which the
 *                  first three 16-bit lanes (three tap pairs) are used. */
static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  /* 4-wide shuffle pattern plus its +2 / +4 variants for tap pairs 1 and 2. */
  v16u8 shuf0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  v16u8 shuf1 = shuf0 + 2;
  v16u8 shuf2 = shuf0 + 4;
  v8i16 taps = LD_SH(filter);
  v16i8 row0, row1, row2, row3, coef0, coef1, coef2;
  v8i16 sum01, sum23;
  v16u8 packed;

  SPLATI_H3_SB(taps, 0, 1, 2, coef0, coef1, coef2);

  /* The filter window begins two pixels left of the output position. */
  src -= 2;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);

  /* sum01 covers rows 0-1, sum23 covers rows 2-3. */
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2,
                             coef0, coef1, coef2, sum01, sum23);
  SRARI_H2_SH(sum01, sum23, VP8_FILTER_SHIFT);
  SAT_SH2_SH(sum01, sum23, 7);

  packed = PCKEV_XORI128_UB(sum01, sum23);
  ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
}
160 
/* Horizontal 6-tap filter for a single 4-wide, 8-row block.
 *
 * src, src_stride: source pixels; each of the eight rows is read starting two
 *                  bytes to the left of src.
 * dst, dst_stride: destination of the 4x8 result.
 * filter:          signed 8-bit taps; loaded as one vector, of which the
 *                  first three 16-bit lanes (three tap pairs) are used. */
static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  /* 4-wide shuffle pattern plus its +2 / +4 variants for tap pairs 1 and 2. */
  v16u8 shuf0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  v16u8 shuf1 = shuf0 + 2;
  v16u8 shuf2 = shuf0 + 4;
  v8i16 taps = LD_SH(filter);
  v16i8 row0, row1, row2, row3, coef0, coef1, coef2;
  v8i16 sum01, sum23, sum45, sum67;
  v16u8 packed;

  SPLATI_H3_SB(taps, 0, 1, 2, coef0, coef1, coef2);

  /* The filter window begins two pixels left of the output position. */
  src -= 2;

  /* Upper four rows. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2,
                             coef0, coef1, coef2, sum01, sum23);

  /* Lower four rows, reusing the same row registers. */
  src += (4 * src_stride);
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2,
                             coef0, coef1, coef2, sum45, sum67);

  /* Round and saturate all eight rows together, then store 4 rows at a
   * time. */
  SRARI_H4_SH(sum01, sum23, sum45, sum67, VP8_FILTER_SHIFT);
  SAT_SH4_SH(sum01, sum23, sum45, sum67, 7);

  packed = PCKEV_XORI128_UB(sum01, sum23);
  ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);

  packed = PCKEV_XORI128_UB(sum45, sum67);
  ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
}
194 
/* Dispatches the 4-pixel-wide horizontal 6-tap filter on block height.
 * Only heights 4 and 8 are handled; for any other height nothing is
 * written. */
static void common_hz_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
      break;
    case 8:
      common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
      break;
    default:
      /* Unsupported height: intentionally a no-op. */
      break;
  }
}
204 
/* Horizontal 6-tap filter for 8-pixel-wide blocks.
 *
 * src, src_stride: source pixels; each row is read starting two bytes to the
 *                  left of src.
 * dst, dst_stride: destination; rows are written in groups of four.
 * filter:          signed 8-bit taps; loaded as one vector, of which the
 *                  first three 16-bit lanes (three tap pairs) are used.
 * height:          number of rows; height >> 2 groups of four rows are
 *                  processed.
 *
 * All row groups go through one loop whose trip count is height >> 2. A
 * separately peeled first group followed by a "(height >> 2) - 1" counter
 * would wrap the unsigned counter when height is below 4. */
static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
  v16u8 mask0, mask1, mask2, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  /* The filter window begins two pixels left of the output position. */
  src -= 2;

  filt = LD_SH(filter);
  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);

  /* Shuffle patterns for the second and third tap pairs. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               filt0, filt1, filt2, out0, out1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    /* Two 8-pixel rows are packed into each 16-byte vector before storing. */
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
248 
/* Horizontal 6-tap filter for 16-pixel-wide blocks.
 *
 * src, src_stride: source pixels; each row is read as two vectors, one
 *                  starting two bytes left of src and one eight bytes after
 *                  that.
 * dst, dst_stride: destination; one full 16-byte vector is stored per row.
 * filter:          signed 8-bit taps; loaded as one vector, of which the
 *                  first three 16-bit lanes (three tap pairs) are used.
 * height:          number of rows; height >> 2 groups of four rows are
 *                  processed. */
static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t quads_left;
  v16i8 r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi;
  v16i8 coef0, coef1, coef2;
  v16u8 shuf0, shuf1, shuf2, packed;
  v8i16 taps;
  v8i16 s0_lo, s0_hi, s1_lo, s1_hi, s2_lo, s2_hi, s3_lo, s3_hi;

  taps = LD_SH(filter);
  SPLATI_H3_SB(taps, 0, 1, 2, coef0, coef1, coef2);

  /* 8-wide shuffle pattern plus its +2 / +4 variants for tap pairs 1 and 2. */
  shuf0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  shuf1 = shuf0 + 2;
  shuf2 = shuf0 + 4;

  /* The filter window begins two pixels left of the output position. */
  src -= 2;

  quads_left = (height >> 2);
  while (quads_left--) {
    /* "lo" vectors feed output columns 0-7, "hi" vectors columns 8-15. */
    LD_SB4(src, src_stride, r0_lo, r1_lo, r2_lo, r3_lo);
    LD_SB4(src + 8, src_stride, r0_hi, r1_hi, r2_hi, r3_hi);
    XORI_B8_128_SB(r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi);
    src += (4 * src_stride);

    HORIZ_6TAP_8WID_4VECS_FILT(r0_lo, r0_hi, r1_lo, r1_hi, shuf0, shuf1, shuf2,
                               coef0, coef1, coef2, s0_lo, s0_hi, s1_lo,
                               s1_hi);
    HORIZ_6TAP_8WID_4VECS_FILT(r2_lo, r2_hi, r3_lo, r3_hi, shuf0, shuf1, shuf2,
                               coef0, coef1, coef2, s2_lo, s2_hi, s3_lo,
                               s3_hi);
    SRARI_H4_SH(s0_lo, s0_hi, s1_lo, s1_hi, VP8_FILTER_SHIFT);
    SRARI_H4_SH(s2_lo, s2_hi, s3_lo, s3_hi, VP8_FILTER_SHIFT);
    SAT_SH4_SH(s0_lo, s0_hi, s1_lo, s1_hi, 7);
    SAT_SH4_SH(s2_lo, s2_hi, s3_lo, s3_hi, 7);

    /* Pack each row's two halves into one vector and store row by row. */
    packed = PCKEV_XORI128_UB(s0_lo, s0_hi);
    ST_UB(packed, dst);
    dst += dst_stride;

    packed = PCKEV_XORI128_UB(s1_lo, s1_hi);
    ST_UB(packed, dst);
    dst += dst_stride;

    packed = PCKEV_XORI128_UB(s2_lo, s2_hi);
    ST_UB(packed, dst);
    dst += dst_stride;

    packed = PCKEV_XORI128_UB(s3_lo, s3_hi);
    ST_UB(packed, dst);
    dst += dst_stride;
  }
}
294 
/* Vertical 6-tap filter for 4-pixel-wide blocks.
 *
 * src, src_stride: source pixels; reading starts two rows above src. Five
 *                  rows are loaded up front and four more per loop pass.
 * dst, dst_stride: destination; four rows are written per loop pass.
 * filter:          signed 8-bit taps; loaded as one vector, of which the
 *                  first three 16-bit lanes (three tap pairs) are used.
 * height:          number of rows; height >> 2 passes are executed. */
static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t quads_left;
  v16i8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16i8 ilv10, ilv21, ilv32, ilv43, ilv54, ilv65, ilv76, ilv87;
  v16i8 pk2110, pk4332, pk6554, pk8776;
  v16i8 coef0, coef1, coef2;
  v16u8 packed;
  v8i16 taps, sum10, sum32;

  taps = LD_SH(filter);
  SPLATI_H3_SB(taps, 0, 1, 2, coef0, coef1, coef2);

  /* The filter window begins two rows above the output position. */
  src -= (2 * src_stride);

  LD_SB5(src, src_stride, row0, row1, row2, row3, row4);
  src += (5 * src_stride);

  /* Interleave vertically adjacent rows, then join two interleaves per
   * vector so each vector serves two output rows. */
  ILVR_B4_SB(row1, row0, row2, row1, row3, row2, row4, row3, ilv10, ilv21,
             ilv32, ilv43);
  ILVR_D2_SB(ilv21, ilv10, ilv43, ilv32, pk2110, pk4332);
  XORI_B2_128_SB(pk2110, pk4332);

  quads_left = (height >> 2);
  while (quads_left--) {
    LD_SB4(src, src_stride, row5, row6, row7, row8);
    src += (4 * src_stride);

    ILVR_B4_SB(row5, row4, row6, row5, row7, row6, row8, row7, ilv54, ilv65,
               ilv76, ilv87);
    ILVR_D2_SB(ilv65, ilv54, ilv87, ilv76, pk6554, pk8776);
    XORI_B2_128_SB(pk6554, pk8776);

    sum10 = DPADD_SH3_SH(pk2110, pk4332, pk6554, coef0, coef1, coef2);
    sum32 = DPADD_SH3_SH(pk4332, pk6554, pk8776, coef0, coef1, coef2);
    SRARI_H2_SH(sum10, sum32, VP8_FILTER_SHIFT);
    SAT_SH2_SH(sum10, sum32, 7);

    packed = PCKEV_XORI128_UB(sum10, sum32);
    ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down four rows for the next pass. */
    pk2110 = pk6554;
    pk4332 = pk8776;
    row4 = row8;
  }
}
339 
/* Vertical 6-tap filter for 8-pixel-wide blocks.
 *
 * src, src_stride: source pixels; reading starts two rows above src. Five
 *                  rows are loaded up front and four more per loop pass.
 * dst, dst_stride: destination; four rows are written per loop pass.
 * filter:          signed 8-bit taps; loaded as one vector, of which the
 *                  first three 16-bit lanes (three tap pairs) are used.
 * height:          number of rows; height >> 2 passes are executed. */
static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t quads_left;
  v16i8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16i8 ilv10, ilv21, ilv32, ilv43, ilv54, ilv65, ilv76, ilv87;
  v16i8 coef0, coef1, coef2;
  v16u8 px01, px23;
  v8i16 taps, sum0, sum1, sum2, sum3;

  taps = LD_SH(filter);
  SPLATI_H3_SB(taps, 0, 1, 2, coef0, coef1, coef2);

  /* The filter window begins two rows above the output position. */
  src -= (2 * src_stride);

  LD_SB5(src, src_stride, row0, row1, row2, row3, row4);
  src += (5 * src_stride);

  XORI_B5_128_SB(row0, row1, row2, row3, row4);
  /* Interleave vertically adjacent rows of the initial window. */
  ILVR_B4_SB(row1, row0, row3, row2, row2, row1, row4, row3, ilv10, ilv32,
             ilv21, ilv43);

  quads_left = (height >> 2);
  while (quads_left--) {
    LD_SB4(src, src_stride, row5, row6, row7, row8);
    XORI_B4_128_SB(row5, row6, row7, row8);
    src += (4 * src_stride);

    ILVR_B4_SB(row5, row4, row6, row5, row7, row6, row8, row7, ilv54, ilv65,
               ilv76, ilv87);

    /* One accumulated sum per output row. */
    sum0 = DPADD_SH3_SH(ilv10, ilv32, ilv54, coef0, coef1, coef2);
    sum1 = DPADD_SH3_SH(ilv21, ilv43, ilv65, coef0, coef1, coef2);
    sum2 = DPADD_SH3_SH(ilv32, ilv54, ilv76, coef0, coef1, coef2);
    sum3 = DPADD_SH3_SH(ilv43, ilv65, ilv87, coef0, coef1, coef2);
    SRARI_H4_SH(sum0, sum1, sum2, sum3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);

    px01 = PCKEV_XORI128_UB(sum0, sum1);
    px23 = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(px01, px23, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down four rows for the next pass. */
    ilv10 = ilv54;
    ilv32 = ilv76;
    ilv21 = ilv65;
    ilv43 = ilv87;
    row4 = row8;
  }
}
387 
/* Vertical 6-tap filter for 16-pixel-wide blocks.
 *
 * src, src_stride: source pixels; reading starts two rows above src. Five
 *                  rows are loaded up front and four more per loop pass.
 * dst, dst_stride: destination; four full 16-byte rows are written per loop
 *                  pass.
 * filter:          signed 8-bit taps; loaded as one vector, of which the
 *                  first three 16-bit lanes (three tap pairs) are used.
 * height:          number of rows; height >> 2 passes are executed.
 *
 * Each row pair is interleaved twice (ILVR and ILVL); the "lo" and "hi"
 * halves are filtered separately and packed back together before storing. */
static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t quads_left;
  v16i8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16i8 lo10, lo21, lo32, lo43, lo54, lo65, lo76, lo87;
  v16i8 hi10, hi21, hi32, hi43, hi54, hi65, hi76, hi87;
  v16i8 coef0, coef1, coef2;
  v16u8 px0, px1, px2, px3;
  v8i16 taps;
  v8i16 sum0_lo, sum1_lo, sum2_lo, sum3_lo;
  v8i16 sum0_hi, sum1_hi, sum2_hi, sum3_hi;

  taps = LD_SH(filter);
  SPLATI_H3_SB(taps, 0, 1, 2, coef0, coef1, coef2);

  /* The filter window begins two rows above the output position. */
  src -= (2 * src_stride);

  LD_SB5(src, src_stride, row0, row1, row2, row3, row4);
  src += (5 * src_stride);

  XORI_B5_128_SB(row0, row1, row2, row3, row4);
  ILVR_B4_SB(row1, row0, row3, row2, row4, row3, row2, row1, lo10, lo32, lo43,
             lo21);
  ILVL_B4_SB(row1, row0, row3, row2, row4, row3, row2, row1, hi10, hi32, hi43,
             hi21);

  quads_left = (height >> 2);
  while (quads_left--) {
    LD_SB4(src, src_stride, row5, row6, row7, row8);
    src += (4 * src_stride);

    XORI_B4_128_SB(row5, row6, row7, row8);
    ILVR_B4_SB(row5, row4, row6, row5, row7, row6, row8, row7, lo54, lo65,
               lo76, lo87);
    ILVL_B4_SB(row5, row4, row6, row5, row7, row6, row8, row7, hi54, hi65,
               hi76, hi87);

    sum0_lo = DPADD_SH3_SH(lo10, lo32, lo54, coef0, coef1, coef2);
    sum1_lo = DPADD_SH3_SH(lo21, lo43, lo65, coef0, coef1, coef2);
    sum2_lo = DPADD_SH3_SH(lo32, lo54, lo76, coef0, coef1, coef2);
    sum3_lo = DPADD_SH3_SH(lo43, lo65, lo87, coef0, coef1, coef2);
    sum0_hi = DPADD_SH3_SH(hi10, hi32, hi54, coef0, coef1, coef2);
    sum1_hi = DPADD_SH3_SH(hi21, hi43, hi65, coef0, coef1, coef2);
    sum2_hi = DPADD_SH3_SH(hi32, hi54, hi76, coef0, coef1, coef2);
    sum3_hi = DPADD_SH3_SH(hi43, hi65, hi87, coef0, coef1, coef2);

    SRARI_H4_SH(sum0_lo, sum1_lo, sum2_lo, sum3_lo, VP8_FILTER_SHIFT);
    SRARI_H4_SH(sum0_hi, sum1_hi, sum2_hi, sum3_hi, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0_lo, sum1_lo, sum2_lo, sum3_lo, 7);
    SAT_SH4_SH(sum0_hi, sum1_hi, sum2_hi, sum3_hi, 7);

    PCKEV_B4_UB(sum0_hi, sum0_lo, sum1_hi, sum1_lo, sum2_hi, sum2_lo, sum3_hi,
                sum3_lo, px0, px1, px2, px3);
    XORI_B4_128_UB(px0, px1, px2, px3);
    ST_UB4(px0, px1, px2, px3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down four rows for the next pass. */
    lo10 = lo54;
    lo32 = lo76;
    lo21 = lo65;
    lo43 = lo87;
    hi10 = hi54;
    hi32 = hi76;
    hi21 = hi65;
    hi43 = hi87;
    row4 = row8;
  }
}
451 
/* Two-pass (horizontal, then vertical) 6-tap filter for 4-pixel-wide blocks.
 *
 * src, src_stride: source pixels; reading starts two rows above and two
 *                  bytes left of src. Five rows are loaded up front and four
 *                  more per loop pass.
 * dst, dst_stride: destination; four rows are written per loop pass.
 * filter_horiz:    taps for the horizontal pass (first three 16-bit lanes of
 *                  the loaded vector are used).
 * filter_vert:     taps for the vertical pass (first three 16-bit lanes of
 *                  the loaded vector are used).
 * height:          number of rows; height >> 2 passes are executed.
 *
 * The vertical rounding uses VP8_FILTER_SHIFT, the same named constant as
 * every other rounding step in this file, instead of a bare literal. */
static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 filt_hz0, filt_hz1, filt_hz2;
  v16u8 mask0, mask1, mask2, out;
  v8i16 tmp0, tmp1;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  /* Step back two columns and two rows to the start of the 6x6 support. */
  src -= (2 + 2 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
  filt = LD_SH(filter_vert);
  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  /* Each horizontal result covers two source rows. hz_out1 is not filtered
   * again; it is assembled from hz_out0 and hz_out2 with an 8-byte slide
   * (presumably yielding the rows 1-2 pair -- confirm against sldi_b). */
  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);

    XORI_B2_128_SB(src5, src6);
    hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);

    LD_SB2(src, src_stride, src7, src8);
    src += (2 * src_stride);

    XORI_B2_128_SB(src7, src8);
    hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);

    /* Vertical pass over the interleaved horizontal results. */
    out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);

    SRARI_H2_SH(tmp0, tmp1, VP8_FILTER_SHIFT);
    /* The 7 here is the saturation width argument, not the filter shift. */
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the last horizontal result and the two newest interleaves into
     * the next pass. */
    hz_out3 = hz_out7;
    out0 = out2;
    out1 = out3;
  }
}
523 
/* Two-pass (horizontal, then vertical) 6-tap filter for 8-pixel-wide blocks.
 *
 * src, src_stride: source pixels; reading starts two rows above and two
 *                  bytes left of src. Five rows are loaded up front and four
 *                  more per loop pass.
 * dst, dst_stride: destination; four rows are written per loop pass.
 * filter_horiz:    taps for the horizontal pass (first three 16-bit lanes of
 *                  the loaded vector are used).
 * filter_vert:     taps for the vertical pass (first three 16-bit lanes of
 *                  the loaded vector are used).
 * height:          number of rows; height >> 2 passes are executed.
 *
 * The vertical rounding uses VP8_FILTER_SHIFT, the same named constant as
 * every other rounding step in this file, instead of a bare literal. */
static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 filt_hz0, filt_hz1, filt_hz2;
  v16u8 mask0, mask1, mask2, vec0, vec1;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 tmp0, tmp1, tmp2, tmp3;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  /* Step back two columns and two rows to the start of the 6x6 support. */
  src -= (2 + 2 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  /* One horizontal result per source row (each row is shuffled with
   * itself). */
  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);

  filt = LD_SH(filter_vert);
  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

  /* Interleave consecutive horizontal results: out0/out1 pair rows
   * (0,1)/(2,3), out3/out4 pair rows (1,2)/(3,4). */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    src += (4 * src_stride);

    XORI_B4_128_SB(src5, src6, src7, src8);
    hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
    tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

    hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);

    hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);

    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    /* The 7 here is the saturation width argument, not the filter shift. */
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(vec0, vec1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal result and interleaves into the next
     * pass. */
    hz_out4 = hz_out8;
    out0 = out2;
    out1 = out7;
    out3 = out5;
    out4 = out6;
  }
}
607 
/* Two-pass 6-tap filter for 16-pixel-wide blocks, implemented as two
 * independent 8-pixel-wide column strips (columns 0-7, then columns 8-15).
 * All other arguments are forwarded unchanged to the 8-wide routine. */
static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_6ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
621 
/* Horizontal 4-tap filter for a single 4x4 block.
 *
 * src, src_stride: source pixels; each of the four rows is read starting one
 *                  byte to the left of src.
 * dst, dst_stride: destination of the 4x4 result.
 * filter:          signed 8-bit taps; loaded as one vector, of which the
 *                  first two 16-bit lanes (two tap pairs) are used. */
static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  /* 4-wide shuffle pattern plus its +2 variant for the second tap pair. */
  v16i8 shuf0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
  v16i8 shuf1 = shuf0 + 2;
  v8i16 taps = LD_SH(filter);
  v16i8 row0, row1, row2, row3, coef0, coef1;
  v8i16 sum01, sum23;
  v16u8 packed;

  SPLATI_H2_SB(taps, 0, 1, coef0, coef1);

  /* The filter window begins one pixel left of the output position. */
  src -= 1;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);

  /* sum01 covers rows 0-1, sum23 covers rows 2-3. */
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, coef0,
                             coef1, sum01, sum23);
  SRARI_H2_SH(sum01, sum23, VP8_FILTER_SHIFT);
  SAT_SH2_SH(sum01, sum23, 7);

  packed = PCKEV_XORI128_UB(sum01, sum23);
  ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
}
646 
/* Horizontal 4-tap filter for a single 4-wide, 8-row block.
 *
 * src, src_stride: source pixels; each of the eight rows is read starting one
 *                  byte to the left of src.
 * dst, dst_stride: destination of the 4x8 result.
 * filter:          signed 8-bit taps; loaded as one vector, of which the
 *                  first two 16-bit lanes (two tap pairs) are used. */
static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  /* 4-wide shuffle pattern plus its +2 variant for the second tap pair. */
  v16i8 shuf0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
  v16i8 shuf1 = shuf0 + 2;
  v8i16 taps = LD_SH(filter);
  v16i8 row0, row1, row2, row3, coef0, coef1;
  v8i16 sum01, sum23, sum45, sum67;
  v16u8 packed;

  SPLATI_H2_SB(taps, 0, 1, coef0, coef1);

  /* The filter window begins one pixel left of the output position. */
  src -= 1;

  /* Upper four rows. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, coef0,
                             coef1, sum01, sum23);

  /* Lower four rows, reusing the same row registers. */
  src += (4 * src_stride);
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, coef0,
                             coef1, sum45, sum67);

  /* Round and saturate all eight rows together, then store 4 rows at a
   * time. */
  SRARI_H4_SH(sum01, sum23, sum45, sum67, VP8_FILTER_SHIFT);
  SAT_SH4_SH(sum01, sum23, sum45, sum67, 7);

  packed = PCKEV_XORI128_UB(sum01, sum23);
  ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);

  packed = PCKEV_XORI128_UB(sum45, sum67);
  ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
}
680 
/* Height dispatcher for the 4-wide horizontal 4-tap filter.
 *
 * Only heights 4 and 8 have an implementation; any other height returns
 * without touching dst.
 */
static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
      break;

    case 8:
      common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
      break;

    default:
      /* Unsupported height: nothing is written. */
      break;
  }
}
690 
/* Horizontal 4-tap filter for an 8-wide block, four rows per pass.
 *
 * height >> 2 passes are executed; each pass loads four rows starting one
 * byte left of src and stores one 8x4 tile to dst.  Rows beyond the last
 * full group of four are not produced.
 *
 * filter: coefficient bytes; halfwords 0 and 1 of the loaded vector are
 *         splatted and used as the two coefficient pairs.
 */
static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 row0, row1, row2, row3;
  v16i8 coef01, coef23, shuf_a, shuf_b;
  v16u8 pix01, pix23;
  v8i16 taps, acc0, acc1, acc2, acc3;

  /* "8 width cases" shuffle pattern and the same pattern advanced by two. */
  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf_b = shuf_a + 2;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef01, coef23);

  /* The filter window begins one pixel to the left of the block. */
  --src;

  while (quads_left--) {
    LD_SB4(src, src_stride, row0, row1, row2, row3);
    XORI_B4_128_SB(row0, row1, row2, row3);

    HORIZ_4TAP_8WID_4VECS_FILT(row0, row1, row2, row3, shuf_a, shuf_b, coef01,
                               coef23, acc0, acc1, acc2, acc3);
    SRARI_H4_SH(acc0, acc1, acc2, acc3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(acc0, acc1, acc2, acc3, 7);

    pix01 = PCKEV_XORI128_UB(acc0, acc1);
    pix23 = PCKEV_XORI128_UB(acc2, acc3);
    ST8x4_UB(pix01, pix23, dst, dst_stride);

    src += 4 * src_stride;
    dst += 4 * dst_stride;
  }
}
722 
/* Horizontal 4-tap filter for a 16-wide block, four rows per pass.
 *
 * Every source row is fetched as two vectors, one at src and one at src + 8
 * (after src has been moved one byte to the left).  Each pair is filtered
 * with the 8-wide helper and packed into one 16-byte output row.
 * height >> 2 passes are executed.
 *
 * filter: coefficient bytes; halfwords 0 and 1 of the loaded vector are
 *         splatted and used as the two coefficient pairs.
 */
static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t quads_left = (height >> 2);
  /* rN_a is row N loaded at src, rN_b is row N loaded at src + 8. */
  v16i8 r0_a, r0_b, r1_a, r1_b, r2_a, r2_b, r3_a, r3_b;
  v16i8 coef01, coef23, shuf_a, shuf_b;
  v8i16 taps, acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  v16u8 pix0, pix1, pix2, pix3;

  /* "8 width cases" shuffle pattern and the same pattern advanced by two. */
  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf_b = shuf_a + 2;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef01, coef23);

  /* The filter window begins one pixel to the left of the block. */
  --src;

  while (quads_left--) {
    LD_SB4(src, src_stride, r0_a, r1_a, r2_a, r3_a);
    LD_SB4(src + 8, src_stride, r0_b, r1_b, r2_b, r3_b);
    src += 4 * src_stride;

    XORI_B8_128_SB(r0_a, r0_b, r1_a, r1_b, r2_a, r2_b, r3_a, r3_b);
    HORIZ_4TAP_8WID_4VECS_FILT(r0_a, r0_b, r1_a, r1_b, shuf_a, shuf_b, coef01,
                               coef23, acc0, acc1, acc2, acc3);
    HORIZ_4TAP_8WID_4VECS_FILT(r2_a, r2_b, r3_a, r3_b, shuf_a, shuf_b, coef01,
                               coef23, acc4, acc5, acc6, acc7);

    SRARI_H4_SH(acc0, acc1, acc2, acc3, VP8_FILTER_SHIFT);
    SRARI_H4_SH(acc4, acc5, acc6, acc7, VP8_FILTER_SHIFT);
    SAT_SH4_SH(acc0, acc1, acc2, acc3, 7);
    SAT_SH4_SH(acc4, acc5, acc6, acc7, 7);

    /* One packed vector per output row, written as four strided stores. */
    pix0 = PCKEV_XORI128_UB(acc0, acc1);
    pix1 = PCKEV_XORI128_UB(acc2, acc3);
    pix2 = PCKEV_XORI128_UB(acc4, acc5);
    pix3 = PCKEV_XORI128_UB(acc6, acc7);
    ST_UB4(pix0, pix1, pix2, pix3, dst, dst_stride);
    dst += 4 * dst_stride;
  }
}
768 
/* Vertical 4-tap filter for a 4-wide block, four output rows per pass.
 *
 * Reading starts one row above src.  Three rows are loaded up front; each
 * pass then loads four more (three at once, then one), so row_c is always
 * the most recently loaded row when a pass begins.  Pairs of adjacent rows
 * are byte-interleaved (ILVR), two such pairs are joined into one vector
 * with ilvr_d, and that vector is XORed with 128 before the dot products.
 * height >> 2 passes are executed.
 *
 * filter: coefficient bytes; halfwords 0 and 1 of the loaded vector are
 *         splatted and used as the two coefficient pairs.
 *
 * NOTE(review): LD_SH presumably reads a whole 16-byte vector from `filter`
 * although only its first two halfwords are consumed -- confirm the caller's
 * table is readable that far.
 */
static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 row_a, row_b, row_c, row_d, row_e, row_f;
  v16i8 il_ab, il_bc, il_cd, il_de, il_ef, il_fc;
  v16i8 join_old, join_new, coef01, coef23;
  v8i16 taps, acc_top, acc_bot;
  v16u8 pix;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef01, coef23);

  /* Prime the pipeline with the row above the block and the next two. */
  src -= src_stride;
  LD_SB3(src, src_stride, row_a, row_b, row_c);
  src += 3 * src_stride;

  ILVR_B2_SB(row_b, row_a, row_c, row_b, il_ab, il_bc);
  join_old = (v16i8)__msa_ilvr_d((v2i64)il_bc, (v2i64)il_ab);
  join_old = (v16i8)__msa_xori_b((v16u8)join_old, 128);

  while (quads_left--) {
    LD_SB3(src, src_stride, row_d, row_e, row_f);
    src += 3 * src_stride;
    ILVR_B2_SB(row_d, row_c, row_e, row_d, il_cd, il_de);
    join_new = (v16i8)__msa_ilvr_d((v2i64)il_de, (v2i64)il_cd);
    join_new = (v16i8)__msa_xori_b((v16u8)join_new, 128);
    acc_top = FILT_4TAP_DPADD_S_H(join_old, join_new, coef01, coef23);

    /* The fourth new row overwrites row_c; it is paired with row_f here and
     * serves as the carried row for the next pass. */
    row_c = LD_SB(src);
    src += src_stride;
    ILVR_B2_SB(row_f, row_e, row_c, row_f, il_ef, il_fc);
    join_old = (v16i8)__msa_ilvr_d((v2i64)il_fc, (v2i64)il_ef);
    join_old = (v16i8)__msa_xori_b((v16u8)join_old, 128);
    acc_bot = FILT_4TAP_DPADD_S_H(join_new, join_old, coef01, coef23);

    SRARI_H2_SH(acc_top, acc_bot, VP8_FILTER_SHIFT);
    SAT_SH2_SH(acc_top, acc_bot, 7);
    pix = PCKEV_XORI128_UB(acc_top, acc_bot);
    ST4x4_UB(pix, pix, 0, 1, 2, 3, dst, dst_stride);
    dst += 4 * dst_stride;
  }
}
813 
/* Vertical 4-tap filter for an 8-wide block, four output rows per pass.
 *
 * Reading starts one row above src.  Three rows are loaded up front and four
 * more per pass.  Adjacent rows are byte-interleaved with ILVR, and every
 * output row is a FILT_4TAP_DPADD_S_H of two such interleaved pairs.  The two
 * newest pairs and the newest row are carried into the next pass.
 * height >> 2 passes are executed.
 *
 * filter: coefficient bytes; halfwords 0 and 1 of the loaded vector are
 *         splatted and used as the two coefficient pairs.
 */
static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 row_a, row_b, row_c, row_d, row_e, row_f, row_g;
  v16i8 il_ab, il_bc, il_cd, il_de, il_ef, il_fg;
  v16i8 coef01, coef23;
  v16u8 pix01, pix23;
  v8i16 taps, acc0, acc1, acc2, acc3;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef01, coef23);

  /* Prime the pipeline with the row above the block and the next two. */
  src -= src_stride;
  LD_SB3(src, src_stride, row_a, row_b, row_c);
  src += 3 * src_stride;

  XORI_B3_128_SB(row_a, row_b, row_c);
  ILVR_B2_SB(row_b, row_a, row_c, row_b, il_ab, il_bc);

  while (quads_left--) {
    LD_SB4(src, src_stride, row_d, row_e, row_f, row_g);
    src += 4 * src_stride;

    XORI_B4_128_SB(row_d, row_e, row_f, row_g);
    ILVR_B4_SB(row_d, row_c, row_e, row_d, row_f, row_e, row_g, row_f, il_cd,
               il_de, il_ef, il_fg);

    acc0 = FILT_4TAP_DPADD_S_H(il_ab, il_cd, coef01, coef23);
    acc1 = FILT_4TAP_DPADD_S_H(il_bc, il_de, coef01, coef23);
    acc2 = FILT_4TAP_DPADD_S_H(il_cd, il_ef, coef01, coef23);
    acc3 = FILT_4TAP_DPADD_S_H(il_de, il_fg, coef01, coef23);

    SRARI_H4_SH(acc0, acc1, acc2, acc3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(acc0, acc1, acc2, acc3, 7);
    pix01 = PCKEV_XORI128_UB(acc0, acc1);
    pix23 = PCKEV_XORI128_UB(acc2, acc3);
    ST8x4_UB(pix01, pix23, dst, dst_stride);
    dst += 4 * dst_stride;

    /* Slide the window down by four rows. */
    il_ab = il_ef;
    il_bc = il_fg;
    row_c = row_g;
  }
}
857 
/* Vertical 4-tap filter for a 16-wide block, four output rows per pass.
 *
 * Same scheme as the narrower vertical kernels, but each adjacent-row pair is
 * interleaved twice: lo_* vectors come from ILVR and hi_* vectors from ILVL,
 * so that both halves of the 16-byte rows are covered.  Results of the two
 * halves are packed together (PCKEV), XORed with 128 and stored as four full
 * vectors.  height >> 2 passes are executed; reading starts one row above
 * src.
 *
 * filter: coefficient bytes; halfwords 0 and 1 of the loaded vector are
 *         splatted and used as the two coefficient pairs.
 */
static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 row_a, row_b, row_c, row_d, row_e, row_f, row_g;
  v16i8 lo_ab, lo_bc, lo_cd, lo_de, lo_ef, lo_fg;
  v16i8 hi_ab, hi_bc, hi_cd, hi_de, hi_ef, hi_fg;
  v16i8 coef01, coef23;
  v16u8 pix0, pix1, pix2, pix3;
  v8i16 taps, lo_acc0, lo_acc1, lo_acc2, lo_acc3;
  v8i16 hi_acc0, hi_acc1, hi_acc2, hi_acc3;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef01, coef23);

  /* Prime the pipeline with the row above the block and the next two. */
  src -= src_stride;
  LD_SB3(src, src_stride, row_a, row_b, row_c);
  src += 3 * src_stride;

  XORI_B3_128_SB(row_a, row_b, row_c);
  ILVR_B2_SB(row_b, row_a, row_c, row_b, lo_ab, lo_bc);
  ILVL_B2_SB(row_b, row_a, row_c, row_b, hi_ab, hi_bc);

  while (quads_left--) {
    LD_SB4(src, src_stride, row_d, row_e, row_f, row_g);
    src += 4 * src_stride;

    XORI_B4_128_SB(row_d, row_e, row_f, row_g);
    ILVR_B4_SB(row_d, row_c, row_e, row_d, row_f, row_e, row_g, row_f, lo_cd,
               lo_de, lo_ef, lo_fg);
    ILVL_B4_SB(row_d, row_c, row_e, row_d, row_f, row_e, row_g, row_f, hi_cd,
               hi_de, hi_ef, hi_fg);

    lo_acc0 = FILT_4TAP_DPADD_S_H(lo_ab, lo_cd, coef01, coef23);
    lo_acc1 = FILT_4TAP_DPADD_S_H(lo_bc, lo_de, coef01, coef23);
    lo_acc2 = FILT_4TAP_DPADD_S_H(lo_cd, lo_ef, coef01, coef23);
    lo_acc3 = FILT_4TAP_DPADD_S_H(lo_de, lo_fg, coef01, coef23);
    hi_acc0 = FILT_4TAP_DPADD_S_H(hi_ab, hi_cd, coef01, coef23);
    hi_acc1 = FILT_4TAP_DPADD_S_H(hi_bc, hi_de, coef01, coef23);
    hi_acc2 = FILT_4TAP_DPADD_S_H(hi_cd, hi_ef, coef01, coef23);
    hi_acc3 = FILT_4TAP_DPADD_S_H(hi_de, hi_fg, coef01, coef23);

    SRARI_H4_SH(lo_acc0, lo_acc1, lo_acc2, lo_acc3, VP8_FILTER_SHIFT);
    SRARI_H4_SH(hi_acc0, hi_acc1, hi_acc2, hi_acc3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(lo_acc0, lo_acc1, lo_acc2, lo_acc3, 7);
    SAT_SH4_SH(hi_acc0, hi_acc1, hi_acc2, hi_acc3, 7);

    PCKEV_B4_UB(hi_acc0, lo_acc0, hi_acc1, lo_acc1, hi_acc2, lo_acc2, hi_acc3,
                lo_acc3, pix0, pix1, pix2, pix3);
    XORI_B4_128_UB(pix0, pix1, pix2, pix3);
    ST_UB4(pix0, pix1, pix2, pix3, dst, dst_stride);
    dst += 4 * dst_stride;

    /* Slide the window down by four rows. */
    lo_ab = lo_ef;
    lo_bc = lo_fg;
    hi_ab = hi_ef;
    hi_bc = hi_fg;
    row_c = row_g;
  }
}
914 
/* 2-D filter for a 4-wide block: 4-tap horizontal, then 4-tap vertical.
 *
 * Reading starts one row above and one column left of src.  Rows are
 * filtered horizontally two at a time (hz_XY is the result for rows X and Y,
 * numbered relative to the current pass).  The in-between pairs are built
 * with an 8-byte SLDI from two neighbouring results.  ilvev_b then interleaves
 * consecutive results to form the vertical filter inputs.
 * height >> 2 passes run, each writing one 4x4 tile.
 *
 * filter_horiz, filter_vert: coefficient bytes; halfwords 0 and 1 of each
 * loaded vector are splatted and used as the two coefficient pairs.
 *
 * NOTE(review): LD_SH presumably reads a whole 16-byte vector from each
 * filter pointer although only two halfwords are consumed -- confirm the
 * caller's table is readable that far.
 */
static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 r0, r1, r2, r3, r4, r5, r6, hcoef01, hcoef23;
  v16u8 shuf_a, shuf_b, pix;
  v8i16 htaps, vtaps, vcoef01, vcoef23;
  v8i16 hz_01, hz_12, hz_23, hz_34, hz_45, hz_56;
  v8i16 col_a, col_b, col_c, sum_top, sum_bot;

  /* "4 width cases" shuffle pattern and the same pattern advanced by two. */
  shuf_a = LD_UB(&vp8_mc_filt_mask_arr[16]);
  shuf_b = shuf_a + 2;

  htaps = LD_SH(filter_horiz);
  SPLATI_H2_SB(htaps, 0, 1, hcoef01, hcoef23);
  vtaps = LD_SH(filter_vert);
  SPLATI_H2_SH(vtaps, 0, 1, vcoef01, vcoef23);

  /* Back up one row and one column, then prime with three rows. */
  src -= src_stride + 1;
  LD_SB3(src, src_stride, r0, r1, r2);
  src += 3 * src_stride;
  XORI_B3_128_SB(r0, r1, r2);

  hz_01 = HORIZ_4TAP_FILT(r0, r1, shuf_a, shuf_b, hcoef01, hcoef23);
  hz_12 = HORIZ_4TAP_FILT(r1, r2, shuf_a, shuf_b, hcoef01, hcoef23);
  col_a = (v8i16)__msa_ilvev_b((v16i8)hz_12, (v16i8)hz_01);

  while (quads_left--) {
    LD_SB4(src, src_stride, r3, r4, r5, r6);
    src += 4 * src_stride;
    XORI_B4_128_SB(r3, r4, r5, r6);

    hz_34 = HORIZ_4TAP_FILT(r3, r4, shuf_a, shuf_b, hcoef01, hcoef23);
    hz_23 = (v8i16)__msa_sldi_b((v16i8)hz_34, (v16i8)hz_12, 8);
    col_b = (v8i16)__msa_ilvev_b((v16i8)hz_34, (v16i8)hz_23);
    sum_top = FILT_4TAP_DPADD_S_H(col_a, col_b, vcoef01, vcoef23);

    hz_56 = HORIZ_4TAP_FILT(r5, r6, shuf_a, shuf_b, hcoef01, hcoef23);
    hz_45 = (v8i16)__msa_sldi_b((v16i8)hz_56, (v16i8)hz_34, 8);
    col_c = (v8i16)__msa_ilvev_b((v16i8)hz_56, (v16i8)hz_45);
    sum_bot = FILT_4TAP_DPADD_S_H(col_b, col_c, vcoef01, vcoef23);

    /* Rounding shift is the literal 7 here; the 1-D kernels in this file use
     * VP8_FILTER_SHIFT for the same step -- presumably the same value,
     * confirm. */
    SRARI_H2_SH(sum_top, sum_bot, 7);
    SAT_SH2_SH(sum_top, sum_bot, 7);
    pix = PCKEV_XORI128_UB(sum_top, sum_bot);
    ST4x4_UB(pix, pix, 0, 1, 2, 3, dst, dst_stride);
    dst += 4 * dst_stride;

    /* Carry the newest horizontal result and vertical input forward. */
    hz_12 = hz_56;
    col_a = col_c;
  }
}
971 
/* 2-D filter for an 8-wide block: 4-tap horizontal, then 4-tap vertical.
 *
 * Reading starts one row above and one column left of src.  Every source row
 * is filtered horizontally on its own (hN is the result for row N, numbered
 * relative to the current pass).  pXY is the even-byte interleave of the
 * results for rows X and Y and feeds the vertical filter.
 * height >> 2 passes run, each writing one 8x4 tile.
 *
 * filter_horiz, filter_vert: coefficient bytes; halfwords 0 and 1 of each
 * loaded vector are splatted and used as the two coefficient pairs.
 */
static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 r0, r1, r2, r3, r4, r5, r6, hcoef01, hcoef23;
  v16u8 shuf_a, shuf_b, pix01, pix23;
  v8i16 htaps, vtaps, vcoef01, vcoef23;
  v8i16 h0, h1, h2, h3, h4, h5, h6;
  v8i16 p01, p12, p23, p34, p45, p56;
  v8i16 sum0, sum1, sum2, sum3;

  /* "8 width cases" shuffle pattern and the same pattern advanced by two. */
  shuf_a = LD_UB(&vp8_mc_filt_mask_arr[0]);
  shuf_b = shuf_a + 2;

  htaps = LD_SH(filter_horiz);
  SPLATI_H2_SB(htaps, 0, 1, hcoef01, hcoef23);
  vtaps = LD_SH(filter_vert);
  SPLATI_H2_SH(vtaps, 0, 1, vcoef01, vcoef23);

  /* Back up one row and one column, then prime with three rows. */
  src -= src_stride + 1;
  LD_SB3(src, src_stride, r0, r1, r2);
  src += 3 * src_stride;
  XORI_B3_128_SB(r0, r1, r2);

  h0 = HORIZ_4TAP_FILT(r0, r0, shuf_a, shuf_b, hcoef01, hcoef23);
  h1 = HORIZ_4TAP_FILT(r1, r1, shuf_a, shuf_b, hcoef01, hcoef23);
  h2 = HORIZ_4TAP_FILT(r2, r2, shuf_a, shuf_b, hcoef01, hcoef23);
  ILVEV_B2_SH(h0, h1, h1, h2, p01, p12);

  while (quads_left--) {
    LD_SB4(src, src_stride, r3, r4, r5, r6);
    src += 4 * src_stride;
    XORI_B4_128_SB(r3, r4, r5, r6);

    h3 = HORIZ_4TAP_FILT(r3, r3, shuf_a, shuf_b, hcoef01, hcoef23);
    p23 = (v8i16)__msa_ilvev_b((v16i8)h3, (v16i8)h2);
    sum0 = FILT_4TAP_DPADD_S_H(p01, p23, vcoef01, vcoef23);

    h4 = HORIZ_4TAP_FILT(r4, r4, shuf_a, shuf_b, hcoef01, hcoef23);
    p34 = (v8i16)__msa_ilvev_b((v16i8)h4, (v16i8)h3);
    sum1 = FILT_4TAP_DPADD_S_H(p12, p34, vcoef01, vcoef23);

    h5 = HORIZ_4TAP_FILT(r5, r5, shuf_a, shuf_b, hcoef01, hcoef23);
    p45 = (v8i16)__msa_ilvev_b((v16i8)h5, (v16i8)h4);
    sum2 = FILT_4TAP_DPADD_S_H(p23, p45, vcoef01, vcoef23);

    h6 = HORIZ_4TAP_FILT(r6, r6, shuf_a, shuf_b, hcoef01, hcoef23);
    /* The h3/h4 pair is regenerated here together with the new h5/h6 pair. */
    ILVEV_B2_SH(h3, h4, h5, h6, p34, p56);
    sum3 = FILT_4TAP_DPADD_S_H(p34, p56, vcoef01, vcoef23);

    /* Rounding shift is the literal 7 here; the 1-D kernels in this file use
     * VP8_FILTER_SHIFT for the same step -- presumably the same value,
     * confirm. */
    SRARI_H4_SH(sum0, sum1, sum2, sum3, 7);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);
    pix01 = PCKEV_XORI128_UB(sum0, sum1);
    pix23 = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(pix01, pix23, dst, dst_stride);
    dst += 4 * dst_stride;

    /* Slide the window down by four rows. */
    p01 = p45;
    p12 = p56;
    h2 = h6;
  }
}
1036 
/* 16-wide 2-D filter (4-tap horizontal, 4-tap vertical per the name).
 *
 * Implemented as two calls to common_hv_4ht_4vt_8w_msa(), one for the columns
 * starting at offset 0 and one for those starting at offset 8; strides,
 * filters and height are forwarded unchanged.
 */
static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_4ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1050 
/* 2-D filter for a 4-wide block: 6-tap horizontal, then 4-tap vertical.
 *
 * Reading starts one row above and two columns left of src.  Rows are
 * filtered horizontally two at a time with HORIZ_6TAP_FILT (hz_XY is the
 * result for rows X and Y, numbered relative to the current pass).  The
 * in-between pairs are built with an 8-byte SLDI from two neighbouring
 * results.  ilvev_b then interleaves consecutive results to form the vertical
 * filter inputs.  height >> 2 passes run, each writing one 4x4 tile.
 *
 * filter_horiz: halfwords 0, 1 and 2 of the loaded vector are splatted and
 *               used as the three horizontal coefficient pairs.
 * filter_vert:  halfwords 0 and 1 of the loaded vector are splatted and used
 *               as the two vertical coefficient pairs.
 *
 * NOTE(review): LD_SH presumably reads a whole 16-byte vector from each
 * filter pointer although only a few halfwords are consumed -- confirm the
 * caller's table is readable that far.
 */
static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 r0, r1, r2, r3, r4, r5, r6;
  v16i8 hcoef01, hcoef23, hcoef45;
  v16u8 pix_top, pix_bot, shuf_a, shuf_b, shuf_c;
  v8i16 htaps, vtaps, vcoef01, vcoef23;
  v8i16 hz_01, hz_12, hz_23, hz_34, hz_45, hz_56;
  v8i16 col_a, col_b, col_c, sum_top, sum_bot;

  /* "4 width cases" shuffle pattern, plus copies advanced by two and four. */
  shuf_a = LD_UB(&vp8_mc_filt_mask_arr[16]);
  shuf_b = shuf_a + 2;
  shuf_c = shuf_a + 4;

  htaps = LD_SH(filter_horiz);
  SPLATI_H3_SB(htaps, 0, 1, 2, hcoef01, hcoef23, hcoef45);
  vtaps = LD_SH(filter_vert);
  SPLATI_H2_SH(vtaps, 0, 1, vcoef01, vcoef23);

  /* Back up one row and two columns, then prime with three rows. */
  src -= src_stride + 2;
  LD_SB3(src, src_stride, r0, r1, r2);
  src += 3 * src_stride;
  XORI_B3_128_SB(r0, r1, r2);

  hz_01 = HORIZ_6TAP_FILT(r0, r1, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                          hcoef45);
  hz_12 = HORIZ_6TAP_FILT(r1, r2, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                          hcoef45);
  col_a = (v8i16)__msa_ilvev_b((v16i8)hz_12, (v16i8)hz_01);

  while (quads_left--) {
    LD_SB4(src, src_stride, r3, r4, r5, r6);
    src += 4 * src_stride;
    XORI_B4_128_SB(r3, r4, r5, r6);

    hz_34 = HORIZ_6TAP_FILT(r3, r4, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                            hcoef45);
    hz_23 = (v8i16)__msa_sldi_b((v16i8)hz_34, (v16i8)hz_12, 8);
    col_b = (v8i16)__msa_ilvev_b((v16i8)hz_34, (v16i8)hz_23);
    sum_top = FILT_4TAP_DPADD_S_H(col_a, col_b, vcoef01, vcoef23);

    hz_56 = HORIZ_6TAP_FILT(r5, r6, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                            hcoef45);
    hz_45 = (v8i16)__msa_sldi_b((v16i8)hz_56, (v16i8)hz_34, 8);
    col_c = (v8i16)__msa_ilvev_b((v16i8)hz_56, (v16i8)hz_45);
    sum_bot = FILT_4TAP_DPADD_S_H(col_b, col_c, vcoef01, vcoef23);

    /* Rounding shift is the literal 7 here; the 1-D kernels in this file use
     * VP8_FILTER_SHIFT for the same step -- presumably the same value,
     * confirm. */
    SRARI_H2_SH(sum_top, sum_bot, 7);
    SAT_SH2_SH(sum_top, sum_bot, 7);
    PCKEV_B2_UB(sum_top, sum_top, sum_bot, sum_bot, pix_top, pix_bot);
    XORI_B2_128_UB(pix_top, pix_bot);
    ST4x4_UB(pix_top, pix_bot, 0, 1, 0, 1, dst, dst_stride);
    dst += 4 * dst_stride;

    /* Carry the newest horizontal result and vertical input forward. */
    hz_12 = hz_56;
    col_a = col_c;
  }
}
1113 
/* 2-D filter for an 8-wide block: 6-tap horizontal, then 4-tap vertical.
 *
 * Reading starts one row above and two columns left of src.  Every source
 * row is filtered horizontally on its own with HORIZ_6TAP_FILT (hN is the
 * result for row N, numbered relative to the current pass).  pXY is the
 * even-byte interleave of the results for rows X and Y and feeds the vertical
 * filter.  height >> 2 passes run, each writing one 8x4 tile.
 *
 * filter_horiz: halfwords 0, 1 and 2 of the loaded vector are splatted and
 *               used as the three horizontal coefficient pairs.
 * filter_vert:  halfwords 0 and 1 of the loaded vector are splatted and used
 *               as the two vertical coefficient pairs.
 */
static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 r0, r1, r2, r3, r4, r5, r6;
  v16i8 hcoef01, hcoef23, hcoef45, shuf_a, shuf_b, shuf_c;
  v8i16 htaps, vtaps, vcoef01, vcoef23;
  v8i16 h0, h1, h2, h3, h4, h5, h6;
  v8i16 p01, p12, p23, p34, p45, p56;
  v8i16 sum0, sum1, sum2, sum3;
  v16u8 pix01, pix23;

  /* "8 width cases" shuffle pattern, plus copies advanced by two and four. */
  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf_b = shuf_a + 2;
  shuf_c = shuf_a + 4;

  htaps = LD_SH(filter_horiz);
  SPLATI_H3_SB(htaps, 0, 1, 2, hcoef01, hcoef23, hcoef45);
  vtaps = LD_SH(filter_vert);
  SPLATI_H2_SH(vtaps, 0, 1, vcoef01, vcoef23);

  /* Back up one row and two columns, then prime with three rows. */
  src -= src_stride + 2;
  LD_SB3(src, src_stride, r0, r1, r2);
  src += 3 * src_stride;
  XORI_B3_128_SB(r0, r1, r2);

  h0 = HORIZ_6TAP_FILT(r0, r0, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                       hcoef45);
  h1 = HORIZ_6TAP_FILT(r1, r1, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                       hcoef45);
  h2 = HORIZ_6TAP_FILT(r2, r2, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                       hcoef45);
  ILVEV_B2_SH(h0, h1, h1, h2, p01, p12);

  while (quads_left--) {
    LD_SB4(src, src_stride, r3, r4, r5, r6);
    src += 4 * src_stride;
    XORI_B4_128_SB(r3, r4, r5, r6);

    h3 = HORIZ_6TAP_FILT(r3, r3, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                         hcoef45);
    p23 = (v8i16)__msa_ilvev_b((v16i8)h3, (v16i8)h2);
    sum0 = FILT_4TAP_DPADD_S_H(p01, p23, vcoef01, vcoef23);

    h4 = HORIZ_6TAP_FILT(r4, r4, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                         hcoef45);
    p34 = (v8i16)__msa_ilvev_b((v16i8)h4, (v16i8)h3);
    sum1 = FILT_4TAP_DPADD_S_H(p12, p34, vcoef01, vcoef23);

    h5 = HORIZ_6TAP_FILT(r5, r5, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                         hcoef45);
    p45 = (v8i16)__msa_ilvev_b((v16i8)h5, (v16i8)h4);
    sum2 = FILT_4TAP_DPADD_S_H(p23, p45, vcoef01, vcoef23);

    h6 = HORIZ_6TAP_FILT(r6, r6, shuf_a, shuf_b, shuf_c, hcoef01, hcoef23,
                         hcoef45);
    /* The h3/h4 pair is regenerated here together with the new h5/h6 pair. */
    ILVEV_B2_SH(h3, h4, h5, h6, p34, p56);
    sum3 = FILT_4TAP_DPADD_S_H(p34, p56, vcoef01, vcoef23);

    /* Rounding shift is the literal 7 here; the 1-D kernels in this file use
     * VP8_FILTER_SHIFT for the same step -- presumably the same value,
     * confirm. */
    SRARI_H4_SH(sum0, sum1, sum2, sum3, 7);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);
    pix01 = PCKEV_XORI128_UB(sum0, sum1);
    pix23 = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(pix01, pix23, dst, dst_stride);
    dst += 4 * dst_stride;

    /* Slide the window down by four rows. */
    p01 = p45;
    p12 = p56;
    h2 = h6;
  }
}
1184 
/* 16-wide 2-D filter (6-tap horizontal, 4-tap vertical per the name).
 *
 * Runs common_hv_6ht_4vt_8w_msa() on the columns starting at offset 0 and
 * again on those starting at offset 8; strides, filters and height are
 * forwarded unchanged.
 */
static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_6ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1198 
/* 2-D filter for a 4-wide block: 4-tap horizontal, then 6-tap vertical.
 *
 * Reading starts two rows above and one column left of src.  Five rows prime
 * the pipeline and four more are loaded per pass.  Rows are filtered
 * horizontally two at a time (hz_XY is the result for rows X and Y, numbered
 * relative to the current pass).  The in-between pairs are built with an
 * 8-byte SLDI from two neighbouring results.  The interleaved vectors col_*
 * feed the three-term vertical DPADD_SH3_SH.
 * height >> 2 passes run, each writing one 4x4 tile.
 *
 * filter_horiz: halfwords 0 and 1 of the loaded vector are splatted and used
 *               as the two horizontal coefficient pairs.
 * filter_vert:  halfwords 0, 1 and 2 of the loaded vector are splatted and
 *               used as the three vertical coefficient pairs.
 *
 * NOTE(review): LD_SH presumably reads a whole 16-byte vector from each
 * filter pointer although only a few halfwords are consumed -- confirm the
 * caller's table is readable that far.
 */
static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 hcoef01, hcoef23, shuf_a, shuf_b;
  v16u8 pix;
  v8i16 hz_01, hz_12, hz_23, hz_34, hz_45, hz_56, hz_67, hz_78;
  v8i16 col_a, col_b, col_c, col_d, sum_top, sum_bot;
  v8i16 htaps, vtaps, vcoef01, vcoef23, vcoef45;

  /* "4 width cases" shuffle pattern and the same pattern advanced by two. */
  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[16]);
  shuf_b = shuf_a + 2;

  htaps = LD_SH(filter_horiz);
  SPLATI_H2_SB(htaps, 0, 1, hcoef01, hcoef23);
  vtaps = LD_SH(filter_vert);
  SPLATI_H3_SH(vtaps, 0, 1, 2, vcoef01, vcoef23, vcoef45);

  /* Back up two rows and one column, then prime with five rows. */
  src -= 2 * src_stride + 1;
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += 5 * src_stride;
  XORI_B5_128_SB(r0, r1, r2, r3, r4);

  hz_01 = HORIZ_4TAP_FILT(r0, r1, shuf_a, shuf_b, hcoef01, hcoef23);
  hz_23 = HORIZ_4TAP_FILT(r2, r3, shuf_a, shuf_b, hcoef01, hcoef23);
  hz_34 = HORIZ_4TAP_FILT(r3, r4, shuf_a, shuf_b, hcoef01, hcoef23);
  hz_12 = (v8i16)__msa_sldi_b((v16i8)hz_23, (v16i8)hz_01, 8);
  ILVEV_B2_SH(hz_01, hz_12, hz_23, hz_34, col_a, col_b);

  while (quads_left--) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    src += 4 * src_stride;
    XORI_B4_128_SB(r5, r6, r7, r8);

    hz_56 = HORIZ_4TAP_FILT(r5, r6, shuf_a, shuf_b, hcoef01, hcoef23);
    hz_45 = (v8i16)__msa_sldi_b((v16i8)hz_56, (v16i8)hz_34, 8);
    col_c = (v8i16)__msa_ilvev_b((v16i8)hz_56, (v16i8)hz_45);
    sum_top = DPADD_SH3_SH(col_a, col_b, col_c, vcoef01, vcoef23, vcoef45);

    hz_78 = HORIZ_4TAP_FILT(r7, r8, shuf_a, shuf_b, hcoef01, hcoef23);
    hz_67 = (v8i16)__msa_sldi_b((v16i8)hz_78, (v16i8)hz_56, 8);
    col_d = (v8i16)__msa_ilvev_b((v16i8)hz_78, (v16i8)hz_67);
    sum_bot = DPADD_SH3_SH(col_b, col_c, col_d, vcoef01, vcoef23, vcoef45);

    /* Rounding shift is the literal 7 here; the 1-D kernels in this file use
     * VP8_FILTER_SHIFT for the same step -- presumably the same value,
     * confirm. */
    SRARI_H2_SH(sum_top, sum_bot, 7);
    SAT_SH2_SH(sum_top, sum_bot, 7);
    pix = PCKEV_XORI128_UB(sum_top, sum_bot);
    ST4x4_UB(pix, pix, 0, 1, 2, 3, dst, dst_stride);
    dst += 4 * dst_stride;

    /* Slide the window down by four rows. */
    hz_34 = hz_78;
    col_a = col_c;
    col_b = col_d;
  }
}
1260 
/* 2-D filter for an 8-wide block: 4-tap horizontal, then 6-tap vertical.
 *
 * Reading starts two rows above and one column left of src.  Five rows prime
 * the pipeline and four more are loaded per pass.  Every row is filtered
 * horizontally on its own (hN is the result for row N, numbered relative to
 * the current pass).  pXY is the even-byte interleave of the results for rows
 * X and Y; three such pairs feed each vertical DPADD_SH3_SH.
 * height >> 2 passes run, each writing one 8x4 tile.
 *
 * filter_horiz: halfwords 0 and 1 of the loaded vector are splatted and used
 *               as the two horizontal coefficient pairs.
 * filter_vert:  halfwords 0, 1 and 2 of the loaded vector are splatted and
 *               used as the three vertical coefficient pairs.
 */
static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t quads_left = (height >> 2);
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 hcoef01, hcoef23, shuf_a, shuf_b;
  v8i16 htaps, vtaps, vcoef01, vcoef23, vcoef45;
  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8;
  v8i16 p01, p12, p23, p34, p45, p56, p67, p78;
  v8i16 sum0, sum1, sum2, sum3;
  v16u8 pix01, pix23;

  /* "8 width cases" shuffle pattern and the same pattern advanced by two. */
  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf_b = shuf_a + 2;

  htaps = LD_SH(filter_horiz);
  SPLATI_H2_SB(htaps, 0, 1, hcoef01, hcoef23);
  vtaps = LD_SH(filter_vert);
  SPLATI_H3_SH(vtaps, 0, 1, 2, vcoef01, vcoef23, vcoef45);

  /* Back up two rows and one column, then prime with five rows. */
  src -= 2 * src_stride + 1;
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += 5 * src_stride;
  XORI_B5_128_SB(r0, r1, r2, r3, r4);

  h0 = HORIZ_4TAP_FILT(r0, r0, shuf_a, shuf_b, hcoef01, hcoef23);
  h1 = HORIZ_4TAP_FILT(r1, r1, shuf_a, shuf_b, hcoef01, hcoef23);
  h2 = HORIZ_4TAP_FILT(r2, r2, shuf_a, shuf_b, hcoef01, hcoef23);
  h3 = HORIZ_4TAP_FILT(r3, r3, shuf_a, shuf_b, hcoef01, hcoef23);
  h4 = HORIZ_4TAP_FILT(r4, r4, shuf_a, shuf_b, hcoef01, hcoef23);
  ILVEV_B2_SH(h0, h1, h2, h3, p01, p23);
  ILVEV_B2_SH(h1, h2, h3, h4, p12, p34);

  while (quads_left--) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    src += 4 * src_stride;
    XORI_B4_128_SB(r5, r6, r7, r8);

    h5 = HORIZ_4TAP_FILT(r5, r5, shuf_a, shuf_b, hcoef01, hcoef23);
    p45 = (v8i16)__msa_ilvev_b((v16i8)h5, (v16i8)h4);
    sum0 = DPADD_SH3_SH(p01, p23, p45, vcoef01, vcoef23, vcoef45);

    h6 = HORIZ_4TAP_FILT(r6, r6, shuf_a, shuf_b, hcoef01, hcoef23);
    p56 = (v8i16)__msa_ilvev_b((v16i8)h6, (v16i8)h5);
    sum1 = DPADD_SH3_SH(p12, p34, p56, vcoef01, vcoef23, vcoef45);

    h7 = HORIZ_4TAP_FILT(r7, r7, shuf_a, shuf_b, hcoef01, hcoef23);
    p67 = (v8i16)__msa_ilvev_b((v16i8)h7, (v16i8)h6);
    sum2 = DPADD_SH3_SH(p23, p45, p67, vcoef01, vcoef23, vcoef45);

    h8 = HORIZ_4TAP_FILT(r8, r8, shuf_a, shuf_b, hcoef01, hcoef23);
    p78 = (v8i16)__msa_ilvev_b((v16i8)h8, (v16i8)h7);
    sum3 = DPADD_SH3_SH(p34, p56, p78, vcoef01, vcoef23, vcoef45);

    /* Rounding shift is the literal 7 here; the 1-D kernels in this file use
     * VP8_FILTER_SHIFT for the same step -- presumably the same value,
     * confirm. */
    SRARI_H4_SH(sum0, sum1, sum2, sum3, 7);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);
    pix01 = PCKEV_XORI128_UB(sum0, sum1);
    pix23 = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(pix01, pix23, dst, dst_stride);
    dst += 4 * dst_stride;

    /* Slide the window down by four rows. */
    h4 = h8;
    p01 = p45;
    p23 = p67;
    p12 = p56;
    p34 = p78;
  }
}
1333 
/* 16-wide 2-D filter (4-tap horizontal, 6-tap vertical per the name).
 *
 * Delegates to common_hv_4ht_6vt_8w_msa() twice: first for the columns
 * starting at offset 0, then for those starting at offset 8.  Strides,
 * filters and height are forwarded unchanged.
 */
static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_4ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1347 
/* Sub-pixel prediction of a 4x4 block.
 *
 * xoffset / yoffset select the horizontal / vertical filter:
 *   - 0          : no filtering along that axis (both 0 -> plain 4x4 copy),
 *   - 2, 4, 6    : 6-tap routine, filter row passed as is,
 *   - 1, 3, 5, 7 : 4-tap routine, filter row passed at +1 because those rows
 *                  of vp8_subpel_filters_msa start with a zero tap.
 * Any other offset value matches no case and dst is left untouched.
 *
 * src/src_stride and dst/dst_stride are forwarded unchanged to the selected
 * routine together with a height of 4.
 */
void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* The filter table only has rows for offsets 1..7.  Indexing it with
   * (offset - 1) for offset 0 would form a pointer before the start of the
   * array, which is undefined behaviour.  Clamp to row 0 instead: the
   * resulting pointer is never handed to a filter routine in that case,
   * because the switches below only dispatch for offsets 1..7. */
  const int32_t h_row = (xoffset >= 1 && xoffset <= 7) ? xoffset - 1 : 0;
  const int32_t v_row = (yoffset >= 1 && yoffset <= 7) ? yoffset - 1 : 0;
  const int8_t *h_filter = vp8_subpel_filters_msa[h_row];
  const int8_t *v_filter = vp8_subpel_filters_msa[v_row];

  if (yoffset) {
    if (xoffset) {
      /* Filtering on both axes. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 4);
              break;

            default: break; /* unsupported yoffset: nothing to do */
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 4);
              break;

            default: break; /* unsupported yoffset: nothing to do */
          }
          break;

        default: break; /* unsupported xoffset: nothing to do */
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_4w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              4);
          break;

        default: break; /* unsupported yoffset: nothing to do */
      }
    }
  } else {
    /* Horizontal filtering only, or a straight copy when xoffset is 0. */
    switch (xoffset) {
      case 0: {
        uint32_t tp0, tp1, tp2, tp3;

        /* One 32-bit word (4 pixels) per row, 4 rows. */
        LW4(src, src_stride, tp0, tp1, tp2, tp3);
        SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
        break;
      }
      case 2:
      case 4:
      case 6:
        common_hz_6t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_4w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
        break;

      default: break; /* unsupported xoffset: nothing to do */
    }
  }
}
1441 
/* Sub-pixel prediction of an 8-wide, 4-high block.
 *
 * xoffset / yoffset select the horizontal / vertical filter:
 *   - 0          : no filtering along that axis (both 0 -> vp8_copy_mem8x4),
 *   - 2, 4, 6    : 6-tap routine, filter row passed as is,
 *   - 1, 3, 5, 7 : 4-tap routine, filter row passed at +1 because those rows
 *                  of vp8_subpel_filters_msa start with a zero tap.
 * Any other offset value matches no case and dst is left untouched.
 *
 * src/src_stride and dst/dst_stride are forwarded unchanged to the selected
 * 8-wide routine together with a height of 4.
 */
void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* The filter table only has rows for offsets 1..7.  Indexing it with
   * (offset - 1) for offset 0 would form a pointer before the start of the
   * array, which is undefined behaviour.  Clamp to row 0 instead: the
   * resulting pointer is never handed to a filter routine in that case,
   * because the switches below only dispatch for offsets 1..7. */
  const int32_t h_row = (xoffset >= 1 && xoffset <= 7) ? xoffset - 1 : 0;
  const int32_t v_row = (yoffset >= 1 && yoffset <= 7) ? yoffset - 1 : 0;
  const int8_t *h_filter = vp8_subpel_filters_msa[h_row];
  const int8_t *v_filter = vp8_subpel_filters_msa[v_row];

  if (yoffset) {
    if (xoffset) {
      /* Filtering on both axes. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 4);
              break;

            default: break; /* unsupported yoffset: nothing to do */
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 4);
              break;

            default: break; /* unsupported yoffset: nothing to do */
          }
          break;

        default: break; /* unsupported xoffset: nothing to do */
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              4);
          break;

        default: break; /* unsupported yoffset: nothing to do */
      }
    }
  } else {
    /* Horizontal filtering only, or a straight copy when xoffset is 0. */
    switch (xoffset) {
      case 0: vp8_copy_mem8x4(src, src_stride, dst, dst_stride); break;
      case 2:
      case 4:
      case 6:
        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
        break;

      default: break; /* unsupported xoffset: nothing to do */
    }
  }
}
1529 
/* Sub-pixel prediction of an 8x8 block.
 *
 * xoffset / yoffset select the horizontal / vertical filter:
 *   - 0          : no filtering along that axis (both 0 -> vp8_copy_mem8x8),
 *   - 2, 4, 6    : 6-tap routine, filter row passed as is,
 *   - 1, 3, 5, 7 : 4-tap routine, filter row passed at +1 because those rows
 *                  of vp8_subpel_filters_msa start with a zero tap.
 * Any other offset value matches no case and dst is left untouched.
 *
 * src/src_stride and dst/dst_stride are forwarded unchanged to the selected
 * 8-wide routine together with a height of 8.
 */
void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* The filter table only has rows for offsets 1..7.  Indexing it with
   * (offset - 1) for offset 0 would form a pointer before the start of the
   * array, which is undefined behaviour.  Clamp to row 0 instead: the
   * resulting pointer is never handed to a filter routine in that case,
   * because the switches below only dispatch for offsets 1..7. */
  const int32_t h_row = (xoffset >= 1 && xoffset <= 7) ? xoffset - 1 : 0;
  const int32_t v_row = (yoffset >= 1 && yoffset <= 7) ? yoffset - 1 : 0;
  const int8_t *h_filter = vp8_subpel_filters_msa[h_row];
  const int8_t *v_filter = vp8_subpel_filters_msa[v_row];

  if (yoffset) {
    if (xoffset) {
      /* Filtering on both axes. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 8);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 8);
              break;

            default: break; /* unsupported yoffset: nothing to do */
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 8);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 8);
              break;

            default: break; /* unsupported yoffset: nothing to do */
          }
          break;

        default: break; /* unsupported xoffset: nothing to do */
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              8);
          break;

        default: break; /* unsupported yoffset: nothing to do */
      }
    }
  } else {
    /* Horizontal filtering only, or a straight copy when xoffset is 0. */
    switch (xoffset) {
      case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break;
      case 2:
      case 4:
      case 6:
        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 8);
        break;

      default: break; /* unsupported xoffset: nothing to do */
    }
  }
}
1617 
/* Sub-pixel prediction of a 16x16 block.
 *
 * xoffset / yoffset select the horizontal / vertical filter:
 *   - 0          : no filtering along that axis (both 0 -> vp8_copy_mem16x16),
 *   - 2, 4, 6    : 6-tap routine, filter row passed as is,
 *   - 1, 3, 5, 7 : 4-tap routine, filter row passed at +1 because those rows
 *                  of vp8_subpel_filters_msa start with a zero tap.
 * Any other offset value matches no case and dst is left untouched.
 *
 * src/src_stride and dst/dst_stride are forwarded unchanged to the selected
 * 16-wide routine together with a height of 16.
 */
void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* The filter table only has rows for offsets 1..7.  Indexing it with
   * (offset - 1) for offset 0 would form a pointer before the start of the
   * array, which is undefined behaviour.  Clamp to row 0 instead: the
   * resulting pointer is never handed to a filter routine in that case,
   * because the switches below only dispatch for offsets 1..7. */
  const int32_t h_row = (xoffset >= 1 && xoffset <= 7) ? xoffset - 1 : 0;
  const int32_t v_row = (yoffset >= 1 && yoffset <= 7) ? yoffset - 1 : 0;
  const int8_t *h_filter = vp8_subpel_filters_msa[h_row];
  const int8_t *v_filter = vp8_subpel_filters_msa[v_row];

  if (yoffset) {
    if (xoffset) {
      /* Filtering on both axes. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter, v_filter, 16);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter, v_filter + 1, 16);
              break;

            default: break; /* unsupported yoffset: nothing to do */
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter + 1, v_filter, 16);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter + 1, v_filter + 1, 16);
              break;

            default: break; /* unsupported yoffset: nothing to do */
          }
          break;

        default: break; /* unsupported xoffset: nothing to do */
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_16w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                               16);
          break;

        default: break; /* unsupported yoffset: nothing to do */
      }
    }
  } else {
    /* Horizontal filtering only, or a straight copy when xoffset is 0. */
    switch (xoffset) {
      case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
      case 2:
      case 4:
      case 6:
        common_hz_6t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_16w_msa(src, src_stride, dst, dst_stride, h_filter + 1,
                             16);
        break;

      default: break; /* unsupported xoffset: nothing to do */
    }
  }
}
1706