1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_ports/mem.h"
13 #include "vpx_dsp/mips/macros_msa.h"
14 #include "vpx_dsp/variance.h"
15 
16 static const uint8_t bilinear_filters_msa[8][2] = {
17   { 128,   0, },
18   { 112,  16, },
19   {  96,  32, },
20   {  80,  48, },
21   {  64,  64, },
22   {  48,  80, },
23   {  32,  96, },
24   {  16, 112, },
25 };
26 
27 #define CALC_MSE_AVG_B(src, ref, var, sub) {                       \
28   v16u8 src_l0_m, src_l1_m;                                        \
29   v8i16 res_l0_m, res_l1_m;                                        \
30                                                                    \
31   ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
32   HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
33   DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
34                                                                    \
35   sub += res_l0_m + res_l1_m;                                      \
36 }
37 
38 #define VARIANCE_WxH(sse, diff, shift) \
39   sse - (((uint32_t)diff * diff) >> shift)
40 
41 #define VARIANCE_LARGE_WxH(sse, diff, shift) \
42   sse - (((int64_t)diff * diff) >> shift)
43 
avg_sse_diff_4width_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * ref_ptr,int32_t ref_stride,const uint8_t * sec_pred,int32_t height,int32_t * diff)44 static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
45                                         int32_t src_stride,
46                                         const uint8_t *ref_ptr,
47                                         int32_t ref_stride,
48                                         const uint8_t *sec_pred,
49                                         int32_t height,
50                                         int32_t *diff) {
51   int32_t ht_cnt;
52   uint32_t src0, src1, src2, src3;
53   uint32_t ref0, ref1, ref2, ref3;
54   v16u8 pred, src = { 0 };
55   v16u8 ref = { 0 };
56   v8i16 avg = { 0 };
57   v4i32 vec, var = { 0 };
58 
59   for (ht_cnt = (height >> 2); ht_cnt--;) {
60     pred = LD_UB(sec_pred);
61     sec_pred += 16;
62     LW4(src_ptr, src_stride, src0, src1, src2, src3);
63     src_ptr += (4 * src_stride);
64     LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
65     ref_ptr += (4 * ref_stride);
66 
67     INSERT_W4_UB(src0, src1, src2, src3, src);
68     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
69 
70     src = __msa_aver_u_b(src, pred);
71     CALC_MSE_AVG_B(src, ref, var, avg);
72   }
73 
74   vec = __msa_hadd_s_w(avg, avg);
75   *diff = HADD_SW_S32(vec);
76 
77   return HADD_SW_S32(var);
78 }
79 
avg_sse_diff_8width_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * ref_ptr,int32_t ref_stride,const uint8_t * sec_pred,int32_t height,int32_t * diff)80 static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
81                                         int32_t src_stride,
82                                         const uint8_t *ref_ptr,
83                                         int32_t ref_stride,
84                                         const uint8_t *sec_pred,
85                                         int32_t height,
86                                         int32_t *diff) {
87   int32_t ht_cnt;
88   v16u8 src0, src1, src2, src3;
89   v16u8 ref0, ref1, ref2, ref3;
90   v16u8 pred0, pred1;
91   v8i16 avg = { 0 };
92   v4i32 vec, var = { 0 };
93 
94   for (ht_cnt = (height >> 2); ht_cnt--;) {
95     LD_UB2(sec_pred, 16, pred0, pred1);
96     sec_pred += 32;
97     LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
98     src_ptr += (4 * src_stride);
99     LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
100     ref_ptr += (4 * ref_stride);
101 
102     PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
103                 src0, src1, ref0, ref1);
104     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
105     CALC_MSE_AVG_B(src0, ref0, var, avg);
106     CALC_MSE_AVG_B(src1, ref1, var, avg);
107   }
108 
109   vec = __msa_hadd_s_w(avg, avg);
110   *diff = HADD_SW_S32(vec);
111 
112   return HADD_SW_S32(var);
113 }
114 
avg_sse_diff_16width_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * ref_ptr,int32_t ref_stride,const uint8_t * sec_pred,int32_t height,int32_t * diff)115 static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
116                                          int32_t src_stride,
117                                          const uint8_t *ref_ptr,
118                                          int32_t ref_stride,
119                                          const uint8_t *sec_pred,
120                                          int32_t height,
121                                          int32_t *diff) {
122   int32_t ht_cnt;
123   v16u8 src, ref, pred;
124   v8i16 avg = { 0 };
125   v4i32 vec, var = { 0 };
126 
127   for (ht_cnt = (height >> 2); ht_cnt--;) {
128     pred = LD_UB(sec_pred);
129     sec_pred += 16;
130     src = LD_UB(src_ptr);
131     src_ptr += src_stride;
132     ref = LD_UB(ref_ptr);
133     ref_ptr += ref_stride;
134     src = __msa_aver_u_b(src, pred);
135     CALC_MSE_AVG_B(src, ref, var, avg);
136 
137     pred = LD_UB(sec_pred);
138     sec_pred += 16;
139     src = LD_UB(src_ptr);
140     src_ptr += src_stride;
141     ref = LD_UB(ref_ptr);
142     ref_ptr += ref_stride;
143     src = __msa_aver_u_b(src, pred);
144     CALC_MSE_AVG_B(src, ref, var, avg);
145 
146     pred = LD_UB(sec_pred);
147     sec_pred += 16;
148     src = LD_UB(src_ptr);
149     src_ptr += src_stride;
150     ref = LD_UB(ref_ptr);
151     ref_ptr += ref_stride;
152     src = __msa_aver_u_b(src, pred);
153     CALC_MSE_AVG_B(src, ref, var, avg);
154 
155     pred = LD_UB(sec_pred);
156     sec_pred += 16;
157     src = LD_UB(src_ptr);
158     src_ptr += src_stride;
159     ref = LD_UB(ref_ptr);
160     ref_ptr += ref_stride;
161     src = __msa_aver_u_b(src, pred);
162     CALC_MSE_AVG_B(src, ref, var, avg);
163   }
164 
165   vec = __msa_hadd_s_w(avg, avg);
166   *diff = HADD_SW_S32(vec);
167 
168   return HADD_SW_S32(var);
169 }
170 
avg_sse_diff_32width_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * ref_ptr,int32_t ref_stride,const uint8_t * sec_pred,int32_t height,int32_t * diff)171 static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
172                                          int32_t src_stride,
173                                          const uint8_t *ref_ptr,
174                                          int32_t ref_stride,
175                                          const uint8_t *sec_pred,
176                                          int32_t height,
177                                          int32_t *diff) {
178   int32_t ht_cnt;
179   v16u8 src0, src1, ref0, ref1, pred0, pred1;
180   v8i16 avg = { 0 };
181   v4i32 vec, var = { 0 };
182 
183   for (ht_cnt = (height >> 2); ht_cnt--;) {
184     LD_UB2(sec_pred, 16, pred0, pred1);
185     sec_pred += 32;
186     LD_UB2(src_ptr, 16, src0, src1);
187     src_ptr += src_stride;
188     LD_UB2(ref_ptr, 16, ref0, ref1);
189     ref_ptr += ref_stride;
190     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
191     CALC_MSE_AVG_B(src0, ref0, var, avg);
192     CALC_MSE_AVG_B(src1, ref1, var, avg);
193 
194     LD_UB2(sec_pred, 16, pred0, pred1);
195     sec_pred += 32;
196     LD_UB2(src_ptr, 16, src0, src1);
197     src_ptr += src_stride;
198     LD_UB2(ref_ptr, 16, ref0, ref1);
199     ref_ptr += ref_stride;
200     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
201     CALC_MSE_AVG_B(src0, ref0, var, avg);
202     CALC_MSE_AVG_B(src1, ref1, var, avg);
203 
204     LD_UB2(sec_pred, 16, pred0, pred1);
205     sec_pred += 32;
206     LD_UB2(src_ptr, 16, src0, src1);
207     src_ptr += src_stride;
208     LD_UB2(ref_ptr, 16, ref0, ref1);
209     ref_ptr += ref_stride;
210     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
211     CALC_MSE_AVG_B(src0, ref0, var, avg);
212     CALC_MSE_AVG_B(src1, ref1, var, avg);
213 
214     LD_UB2(sec_pred, 16, pred0, pred1);
215     sec_pred += 32;
216     LD_UB2(src_ptr, 16, src0, src1);
217     src_ptr += src_stride;
218     LD_UB2(ref_ptr, 16, ref0, ref1);
219     ref_ptr += ref_stride;
220     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
221     CALC_MSE_AVG_B(src0, ref0, var, avg);
222     CALC_MSE_AVG_B(src1, ref1, var, avg);
223   }
224 
225   vec = __msa_hadd_s_w(avg, avg);
226   *diff = HADD_SW_S32(vec);
227 
228   return HADD_SW_S32(var);
229 }
230 
avg_sse_diff_32x64_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * ref_ptr,int32_t ref_stride,const uint8_t * sec_pred,int32_t * diff)231 static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
232                                        int32_t src_stride,
233                                        const uint8_t *ref_ptr,
234                                        int32_t ref_stride,
235                                        const uint8_t *sec_pred,
236                                        int32_t *diff) {
237   int32_t ht_cnt;
238   v16u8 src0, src1, ref0, ref1, pred0, pred1;
239   v8i16 avg0 = { 0 };
240   v8i16 avg1 = { 0 };
241   v4i32 vec, var = { 0 };
242 
243   for (ht_cnt = 16; ht_cnt--;) {
244     LD_UB2(sec_pred, 16, pred0, pred1);
245     sec_pred += 32;
246     LD_UB2(src_ptr, 16, src0, src1);
247     src_ptr += src_stride;
248     LD_UB2(ref_ptr, 16, ref0, ref1);
249     ref_ptr += ref_stride;
250     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
251     CALC_MSE_AVG_B(src0, ref0, var, avg0);
252     CALC_MSE_AVG_B(src1, ref1, var, avg1);
253 
254     LD_UB2(sec_pred, 16, pred0, pred1);
255     sec_pred += 32;
256     LD_UB2(src_ptr, 16, src0, src1);
257     src_ptr += src_stride;
258     LD_UB2(ref_ptr, 16, ref0, ref1);
259     ref_ptr += ref_stride;
260     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
261     CALC_MSE_AVG_B(src0, ref0, var, avg0);
262     CALC_MSE_AVG_B(src1, ref1, var, avg1);
263 
264     LD_UB2(sec_pred, 16, pred0, pred1);
265     sec_pred += 32;
266     LD_UB2(src_ptr, 16, src0, src1);
267     src_ptr += src_stride;
268     LD_UB2(ref_ptr, 16, ref0, ref1);
269     ref_ptr += ref_stride;
270     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
271     CALC_MSE_AVG_B(src0, ref0, var, avg0);
272     CALC_MSE_AVG_B(src1, ref1, var, avg1);
273 
274     LD_UB2(sec_pred, 16, pred0, pred1);
275     sec_pred += 32;
276     LD_UB2(src_ptr, 16, src0, src1);
277     src_ptr += src_stride;
278     LD_UB2(ref_ptr, 16, ref0, ref1);
279     ref_ptr += ref_stride;
280     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
281     CALC_MSE_AVG_B(src0, ref0, var, avg0);
282     CALC_MSE_AVG_B(src1, ref1, var, avg1);
283   }
284 
285   vec = __msa_hadd_s_w(avg0, avg0);
286   vec += __msa_hadd_s_w(avg1, avg1);
287   *diff = HADD_SW_S32(vec);
288 
289   return HADD_SW_S32(var);
290 }
291 
avg_sse_diff_64x32_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * ref_ptr,int32_t ref_stride,const uint8_t * sec_pred,int32_t * diff)292 static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
293                                        int32_t src_stride,
294                                        const uint8_t *ref_ptr,
295                                        int32_t ref_stride,
296                                        const uint8_t *sec_pred,
297                                        int32_t *diff) {
298   int32_t ht_cnt;
299   v16u8 src0, src1, src2, src3;
300   v16u8 ref0, ref1, ref2, ref3;
301   v16u8 pred0, pred1, pred2, pred3;
302   v8i16 avg0 = { 0 };
303   v8i16 avg1 = { 0 };
304   v4i32 vec, var = { 0 };
305 
306   for (ht_cnt = 16; ht_cnt--;) {
307     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
308     sec_pred += 64;
309     LD_UB4(src_ptr, 16, src0, src1, src2, src3);
310     src_ptr += src_stride;
311     LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
312     ref_ptr += ref_stride;
313     AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
314                 src0, src1, src2, src3);
315     CALC_MSE_AVG_B(src0, ref0, var, avg0);
316     CALC_MSE_AVG_B(src2, ref2, var, avg0);
317     CALC_MSE_AVG_B(src1, ref1, var, avg1);
318     CALC_MSE_AVG_B(src3, ref3, var, avg1);
319 
320     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
321     sec_pred += 64;
322     LD_UB4(src_ptr, 16, src0, src1, src2, src3);
323     src_ptr += src_stride;
324     LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
325     ref_ptr += ref_stride;
326     AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
327                 src0, src1, src2, src3);
328     CALC_MSE_AVG_B(src0, ref0, var, avg0);
329     CALC_MSE_AVG_B(src2, ref2, var, avg0);
330     CALC_MSE_AVG_B(src1, ref1, var, avg1);
331     CALC_MSE_AVG_B(src3, ref3, var, avg1);
332   }
333 
334   vec = __msa_hadd_s_w(avg0, avg0);
335   vec += __msa_hadd_s_w(avg1, avg1);
336 
337   *diff = HADD_SW_S32(vec);
338 
339   return HADD_SW_S32(var);
340 }
341 
avg_sse_diff_64x64_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * ref_ptr,int32_t ref_stride,const uint8_t * sec_pred,int32_t * diff)342 static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
343                                        int32_t src_stride,
344                                        const uint8_t *ref_ptr,
345                                        int32_t ref_stride,
346                                        const uint8_t *sec_pred,
347                                        int32_t *diff) {
348   int32_t ht_cnt;
349   v16u8 src0, src1, src2, src3;
350   v16u8 ref0, ref1, ref2, ref3;
351   v16u8 pred0, pred1, pred2, pred3;
352   v8i16 avg0 = { 0 };
353   v8i16 avg1 = { 0 };
354   v8i16 avg2 = { 0 };
355   v8i16 avg3 = { 0 };
356   v4i32 vec, var = { 0 };
357 
358   for (ht_cnt = 32; ht_cnt--;) {
359     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
360     sec_pred += 64;
361     LD_UB4(src_ptr, 16, src0, src1, src2, src3);
362     src_ptr += src_stride;
363     LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
364     ref_ptr += ref_stride;
365     AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
366                 src0, src1, src2, src3);
367     CALC_MSE_AVG_B(src0, ref0, var, avg0);
368     CALC_MSE_AVG_B(src1, ref1, var, avg1);
369     CALC_MSE_AVG_B(src2, ref2, var, avg2);
370     CALC_MSE_AVG_B(src3, ref3, var, avg3);
371 
372     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
373     sec_pred += 64;
374     LD_UB4(src_ptr, 16, src0, src1, src2, src3);
375     src_ptr += src_stride;
376     LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
377     ref_ptr += ref_stride;
378     AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
379                 src0, src1, src2, src3);
380     CALC_MSE_AVG_B(src0, ref0, var, avg0);
381     CALC_MSE_AVG_B(src1, ref1, var, avg1);
382     CALC_MSE_AVG_B(src2, ref2, var, avg2);
383     CALC_MSE_AVG_B(src3, ref3, var, avg3);
384   }
385 
386   vec = __msa_hadd_s_w(avg0, avg0);
387   vec += __msa_hadd_s_w(avg1, avg1);
388   vec += __msa_hadd_s_w(avg2, avg2);
389   vec += __msa_hadd_s_w(avg3, avg3);
390   *diff = HADD_SW_S32(vec);
391 
392   return HADD_SW_S32(var);
393 }
394 
sub_pixel_sse_diff_4width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)395 static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
396                                                 int32_t src_stride,
397                                                 const uint8_t *dst,
398                                                 int32_t dst_stride,
399                                                 const uint8_t *filter,
400                                                 int32_t height,
401                                                 int32_t *diff) {
402   int16_t filtval;
403   uint32_t loop_cnt;
404   uint32_t ref0, ref1, ref2, ref3;
405   v16u8 filt0, ref = { 0 };
406   v16i8 src0, src1, src2, src3;
407   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
408   v8u16 vec0, vec1, vec2, vec3;
409   v8i16 avg = { 0 };
410   v4i32 vec, var = { 0 };
411 
412   filtval = LH(filter);
413   filt0 = (v16u8)__msa_fill_h(filtval);
414 
415   for (loop_cnt = (height >> 2); loop_cnt--;) {
416     LD_SB4(src, src_stride, src0, src1, src2, src3);
417     src += (4 * src_stride);
418     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
419     dst += (4 * dst_stride);
420     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
421     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
422     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
423     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
424                 vec0, vec1, vec2, vec3);
425     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
426     PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
427                 src0, src1, src2, src3);
428     ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
429     src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
430     CALC_MSE_AVG_B(src0, ref, var, avg);
431   }
432 
433   vec = __msa_hadd_s_w(avg, avg);
434   *diff = HADD_SW_S32(vec);
435 
436   return HADD_SW_S32(var);
437 }
438 
sub_pixel_sse_diff_8width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)439 static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
440                                                 int32_t src_stride,
441                                                 const uint8_t *dst,
442                                                 int32_t dst_stride,
443                                                 const uint8_t *filter,
444                                                 int32_t height,
445                                                 int32_t *diff) {
446   int16_t filtval;
447   uint32_t loop_cnt;
448   v16u8 filt0, out, ref0, ref1, ref2, ref3;
449   v16i8 src0, src1, src2, src3;
450   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
451   v8u16 vec0, vec1, vec2, vec3;
452   v8i16 avg = { 0 };
453   v4i32 vec, var = { 0 };
454 
455   filtval = LH(filter);
456   filt0 = (v16u8)__msa_fill_h(filtval);
457 
458   for (loop_cnt = (height >> 2); loop_cnt--;) {
459     LD_SB4(src, src_stride, src0, src1, src2, src3);
460     src += (4 * src_stride);
461     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
462     dst += (4 * dst_stride);
463 
464     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
465     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
466     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
467     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
468                 vec0, vec1, vec2, vec3);
469     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
470     PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
471                 src0, src1, src2, src3);
472     out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
473     CALC_MSE_AVG_B(out, ref0, var, avg);
474     out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
475     CALC_MSE_AVG_B(out, ref1, var, avg);
476   }
477 
478   vec = __msa_hadd_s_w(avg, avg);
479   *diff = HADD_SW_S32(vec);
480 
481   return HADD_SW_S32(var);
482 }
483 
sub_pixel_sse_diff_16width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)484 static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
485                                                  int32_t src_stride,
486                                                  const uint8_t *dst,
487                                                  int32_t dst_stride,
488                                                  const uint8_t *filter,
489                                                  int32_t height,
490                                                  int32_t *diff) {
491   int16_t filtval;
492   uint32_t loop_cnt;
493   v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
494   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
495   v16u8 dst0, dst1, dst2, dst3, filt0;
496   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
497   v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
498   v8i16 avg = { 0 };
499   v4i32 vec, var = { 0 };
500 
501   filtval = LH(filter);
502   filt0 = (v16u8)__msa_fill_h(filtval);
503 
504   for (loop_cnt = (height >> 2); loop_cnt--;) {
505     LD_SB4(src, src_stride, src0, src2, src4, src6);
506     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
507     src += (4 * src_stride);
508     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
509     dst += (4 * dst_stride);
510 
511     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
512     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
513     VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
514     VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
515     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
516                 out0, out1, out2, out3);
517     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
518                 out4, out5, out6, out7);
519     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
520     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
521     PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
522                 src0, src1, src2, src3);
523     CALC_MSE_AVG_B(src0, dst0, var, avg);
524     CALC_MSE_AVG_B(src1, dst1, var, avg);
525     CALC_MSE_AVG_B(src2, dst2, var, avg);
526     CALC_MSE_AVG_B(src3, dst3, var, avg);
527   }
528 
529   vec = __msa_hadd_s_w(avg, avg);
530   *diff = HADD_SW_S32(vec);
531 
532   return HADD_SW_S32(var);
533 }
534 
sub_pixel_sse_diff_32width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)535 static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
536                                                  int32_t src_stride,
537                                                  const uint8_t *dst,
538                                                  int32_t dst_stride,
539                                                  const uint8_t *filter,
540                                                  int32_t height,
541                                                  int32_t *diff) {
542   uint32_t loop_cnt, sse = 0;
543   int32_t diff0[2];
544 
545   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
546     sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
547                                             filter, height, &diff0[loop_cnt]);
548     src += 16;
549     dst += 16;
550   }
551 
552   *diff = diff0[0] + diff0[1];
553 
554   return sse;
555 }
556 
sub_pixel_sse_diff_64width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)557 static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
558                                                  int32_t src_stride,
559                                                  const uint8_t *dst,
560                                                  int32_t dst_stride,
561                                                  const uint8_t *filter,
562                                                  int32_t height,
563                                                  int32_t *diff) {
564   uint32_t loop_cnt, sse = 0;
565   int32_t diff0[4];
566 
567   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
568     sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
569                                             filter, height, &diff0[loop_cnt]);
570     src += 16;
571     dst += 16;
572   }
573 
574   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
575 
576   return sse;
577 }
578 
sub_pixel_sse_diff_4width_v_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)579 static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
580                                                 int32_t src_stride,
581                                                 const uint8_t *dst,
582                                                 int32_t dst_stride,
583                                                 const uint8_t *filter,
584                                                 int32_t height,
585                                                 int32_t *diff) {
586   int16_t filtval;
587   uint32_t loop_cnt;
588   uint32_t ref0, ref1, ref2, ref3;
589   v16u8 src0, src1, src2, src3, src4, out;
590   v16u8 src10_r, src32_r, src21_r, src43_r;
591   v16u8 ref = { 0 };
592   v16u8 src2110, src4332;
593   v16u8 filt0;
594   v8i16 avg = { 0 };
595   v4i32 vec, var = { 0 };
596   v8u16 tmp0, tmp1;
597 
598   filtval = LH(filter);
599   filt0 = (v16u8)__msa_fill_h(filtval);
600 
601   src0 = LD_UB(src);
602   src += src_stride;
603 
604   for (loop_cnt = (height >> 2); loop_cnt--;) {
605     LD_UB4(src, src_stride, src1, src2, src3, src4);
606     src += (4 * src_stride);
607     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
608     dst += (4 * dst_stride);
609 
610     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
611     ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
612                src10_r, src21_r, src32_r, src43_r);
613     ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
614     DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
615     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
616     out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
617     CALC_MSE_AVG_B(out, ref, var, avg);
618     src0 = src4;
619   }
620 
621   vec = __msa_hadd_s_w(avg, avg);
622   *diff = HADD_SW_S32(vec);
623 
624   return HADD_SW_S32(var);
625 }
626 
sub_pixel_sse_diff_8width_v_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)627 static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
628                                                 int32_t src_stride,
629                                                 const uint8_t *dst,
630                                                 int32_t dst_stride,
631                                                 const uint8_t *filter,
632                                                 int32_t height,
633                                                 int32_t *diff) {
634   int16_t filtval;
635   uint32_t loop_cnt;
636   v16u8 src0, src1, src2, src3, src4;
637   v16u8 ref0, ref1, ref2, ref3;
638   v8u16 vec0, vec1, vec2, vec3;
639   v8u16 tmp0, tmp1, tmp2, tmp3;
640   v16u8 filt0;
641   v8i16 avg = { 0 };
642   v4i32 vec, var = { 0 };
643 
644   filtval = LH(filter);
645   filt0 = (v16u8)__msa_fill_h(filtval);
646 
647   src0 = LD_UB(src);
648   src += src_stride;
649 
650   for (loop_cnt = (height >> 2); loop_cnt--;) {
651     LD_UB4(src, src_stride, src1, src2, src3, src4);
652     src += (4 * src_stride);
653     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
654     dst += (4 * dst_stride);
655 
656     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
657     ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
658                vec0, vec1, vec2, vec3);
659     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
660                 tmp0, tmp1, tmp2, tmp3);
661     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
662     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
663     CALC_MSE_AVG_B(src0, ref0, var, avg);
664     CALC_MSE_AVG_B(src1, ref1, var, avg);
665     src0 = src4;
666   }
667 
668   vec = __msa_hadd_s_w(avg, avg);
669   *diff = HADD_SW_S32(vec);
670 
671   return HADD_SW_S32(var);
672 }
673 
sub_pixel_sse_diff_16width_v_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)674 static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
675                                                  int32_t src_stride,
676                                                  const uint8_t *dst,
677                                                  int32_t dst_stride,
678                                                  const uint8_t *filter,
679                                                  int32_t height,
680                                                  int32_t *diff) {
681   int16_t filtval;
682   uint32_t loop_cnt;
683   v16u8 ref0, ref1, ref2, ref3;
684   v16u8 src0, src1, src2, src3, src4;
685   v16u8 out0, out1, out2, out3;
686   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
687   v8u16 tmp0, tmp1, tmp2, tmp3;
688   v16u8 filt0;
689   v8i16 avg = { 0 };
690   v4i32 vec, var = { 0 };
691 
692   filtval = LH(filter);
693   filt0 = (v16u8)__msa_fill_h(filtval);
694 
695   src0 = LD_UB(src);
696   src += src_stride;
697 
698   for (loop_cnt = (height >> 2); loop_cnt--;) {
699     LD_UB4(src, src_stride, src1, src2, src3, src4);
700     src += (4 * src_stride);
701     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
702     dst += (4 * dst_stride);
703 
704     ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
705     ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
706     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
707     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
708     out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
709 
710     ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
711     ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
712     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
713     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
714     out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
715 
716     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
717     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
718     out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
719     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
720     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
721     out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
722 
723     src0 = src4;
724 
725     CALC_MSE_AVG_B(out0, ref0, var, avg);
726     CALC_MSE_AVG_B(out1, ref1, var, avg);
727     CALC_MSE_AVG_B(out2, ref2, var, avg);
728     CALC_MSE_AVG_B(out3, ref3, var, avg);
729   }
730 
731   vec = __msa_hadd_s_w(avg, avg);
732   *diff = HADD_SW_S32(vec);
733 
734   return HADD_SW_S32(var);
735 }
736 
sub_pixel_sse_diff_32width_v_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)737 static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
738                                                  int32_t src_stride,
739                                                  const uint8_t *dst,
740                                                  int32_t dst_stride,
741                                                  const uint8_t *filter,
742                                                  int32_t height,
743                                                  int32_t *diff) {
744   uint32_t loop_cnt, sse = 0;
745   int32_t diff0[2];
746 
747   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
748     sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
749                                             filter, height, &diff0[loop_cnt]);
750     src += 16;
751     dst += 16;
752   }
753 
754   *diff = diff0[0] + diff0[1];
755 
756   return sse;
757 }
758 
sub_pixel_sse_diff_64width_v_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)759 static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
760                                                  int32_t src_stride,
761                                                  const uint8_t *dst,
762                                                  int32_t dst_stride,
763                                                  const uint8_t *filter,
764                                                  int32_t height,
765                                                  int32_t *diff) {
766   uint32_t loop_cnt, sse = 0;
767   int32_t diff0[4];
768 
769   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
770     sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
771                                             filter, height, &diff0[loop_cnt]);
772     src += 16;
773     dst += 16;
774   }
775 
776   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
777 
778   return sse;
779 }
780 
sub_pixel_sse_diff_4width_hv_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter_horiz,const uint8_t * filter_vert,int32_t height,int32_t * diff)781 static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
782                                                  int32_t src_stride,
783                                                  const uint8_t *dst,
784                                                  int32_t dst_stride,
785                                                  const uint8_t *filter_horiz,
786                                                  const uint8_t *filter_vert,
787                                                  int32_t height,
788                                                  int32_t *diff) {
789   int16_t filtval;
790   uint32_t loop_cnt;
791   uint32_t ref0, ref1, ref2, ref3;
792   v16u8 src0, src1, src2, src3, src4;
793   v16u8 out, ref = { 0 };
794   v16u8 filt_vt, filt_hz, vec0, vec1;
795   v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
796   v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
797   v8u16 tmp0, tmp1;
798   v8i16 avg = { 0 };
799   v4i32 vec, var = { 0 };
800 
801   filtval = LH(filter_horiz);
802   filt_hz = (v16u8)__msa_fill_h(filtval);
803   filtval = LH(filter_vert);
804   filt_vt = (v16u8)__msa_fill_h(filtval);
805 
806   src0 = LD_UB(src);
807   src += src_stride;
808 
809   for (loop_cnt = (height >> 2); loop_cnt--;) {
810     LD_UB4(src, src_stride, src1, src2, src3, src4);
811     src += (4 * src_stride);
812     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
813     dst += (4 * dst_stride);
814     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
815     hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
816     hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
817     hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
818     hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
819     hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
820     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
821     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
822     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
823     out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
824     CALC_MSE_AVG_B(out, ref, var, avg);
825     src0 = src4;
826   }
827 
828   vec = __msa_hadd_s_w(avg, avg);
829   *diff = HADD_SW_S32(vec);
830 
831   return HADD_SW_S32(var);
832 }
833 
sub_pixel_sse_diff_8width_hv_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter_horiz,const uint8_t * filter_vert,int32_t height,int32_t * diff)834 static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
835                                                  int32_t src_stride,
836                                                  const uint8_t *dst,
837                                                  int32_t dst_stride,
838                                                  const uint8_t *filter_horiz,
839                                                  const uint8_t *filter_vert,
840                                                  int32_t height,
841                                                  int32_t *diff) {
842   int16_t filtval;
843   uint32_t loop_cnt;
844   v16u8 ref0, ref1, ref2, ref3;
845   v16u8 src0, src1, src2, src3, src4;
846   v16u8 out0, out1;
847   v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
848   v8u16 hz_out0, hz_out1;
849   v8u16 tmp0, tmp1, tmp2, tmp3;
850   v16u8 filt_vt, filt_hz, vec0;
851   v8i16 avg = { 0 };
852   v4i32 vec, var = { 0 };
853 
854   filtval = LH(filter_horiz);
855   filt_hz = (v16u8)__msa_fill_h(filtval);
856   filtval = LH(filter_vert);
857   filt_vt = (v16u8)__msa_fill_h(filtval);
858 
859   src0 = LD_UB(src);
860   src += src_stride;
861   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
862 
863   for (loop_cnt = (height >> 2); loop_cnt--;) {
864     LD_UB4(src, src_stride, src1, src2, src3, src4);
865     src += (4 * src_stride);
866     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
867     dst += (4 * dst_stride);
868 
869     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
870     hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
871     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
872     tmp0 = __msa_dotp_u_h(vec0, filt_vt);
873     hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
874     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
875     tmp1 = __msa_dotp_u_h(vec0, filt_vt);
876     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
877     hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
878     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
879     tmp2 = __msa_dotp_u_h(vec0, filt_vt);
880     hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
881     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
882     tmp3 = __msa_dotp_u_h(vec0, filt_vt);
883     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
884     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
885     CALC_MSE_AVG_B(out0, ref0, var, avg);
886     CALC_MSE_AVG_B(out1, ref1, var, avg);
887   }
888 
889   vec = __msa_hadd_s_w(avg, avg);
890   *diff = HADD_SW_S32(vec);
891 
892   return HADD_SW_S32(var);
893 }
894 
sub_pixel_sse_diff_16width_hv_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter_horiz,const uint8_t * filter_vert,int32_t height,int32_t * diff)895 static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
896                                                   int32_t src_stride,
897                                                   const uint8_t *dst,
898                                                   int32_t dst_stride,
899                                                   const uint8_t *filter_horiz,
900                                                   const uint8_t *filter_vert,
901                                                   int32_t height,
902                                                   int32_t *diff) {
903   int16_t filtval;
904   uint32_t loop_cnt;
905   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
906   v16u8 ref0, ref1, ref2, ref3;
907   v16u8 filt_hz, filt_vt, vec0, vec1;
908   v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
909   v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
910   v8u16 tmp0, tmp1;
911   v8i16 avg = { 0 };
912   v4i32 vec, var = { 0 };
913 
914   filtval = LH(filter_horiz);
915   filt_hz = (v16u8)__msa_fill_h(filtval);
916   filtval = LH(filter_vert);
917   filt_vt = (v16u8)__msa_fill_h(filtval);
918 
919   LD_UB2(src, 8, src0, src1);
920   src += src_stride;
921 
922   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
923   hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
924 
925   for (loop_cnt = (height >> 2); loop_cnt--;) {
926     LD_UB4(src, src_stride, src0, src2, src4, src6);
927     LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
928     src += (4 * src_stride);
929     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
930     dst += (4 * dst_stride);
931 
932     hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
933     hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
934     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
935     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
936     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
937     src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
938 
939     hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
940     hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
941     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
942     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
943     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
944     src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
945 
946     hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
947     hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
948     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
949     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
950     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
951     src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
952 
953     hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
954     hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
955     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
956     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
957     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
958     src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
959 
960     CALC_MSE_AVG_B(src0, ref0, var, avg);
961     CALC_MSE_AVG_B(src1, ref1, var, avg);
962     CALC_MSE_AVG_B(src2, ref2, var, avg);
963     CALC_MSE_AVG_B(src3, ref3, var, avg);
964   }
965 
966   vec = __msa_hadd_s_w(avg, avg);
967   *diff = HADD_SW_S32(vec);
968 
969   return HADD_SW_S32(var);
970 }
971 
sub_pixel_sse_diff_32width_hv_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter_horiz,const uint8_t * filter_vert,int32_t height,int32_t * diff)972 static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
973                                                   int32_t src_stride,
974                                                   const uint8_t *dst,
975                                                   int32_t dst_stride,
976                                                   const uint8_t *filter_horiz,
977                                                   const uint8_t *filter_vert,
978                                                   int32_t height,
979                                                   int32_t *diff) {
980   uint32_t loop_cnt, sse = 0;
981   int32_t diff0[2];
982 
983   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
984     sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
985                                              filter_horiz, filter_vert, height,
986                                              &diff0[loop_cnt]);
987     src += 16;
988     dst += 16;
989   }
990 
991   *diff = diff0[0] + diff0[1];
992 
993   return sse;
994 }
995 
sub_pixel_sse_diff_64width_hv_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter_horiz,const uint8_t * filter_vert,int32_t height,int32_t * diff)996 static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
997                                                   int32_t src_stride,
998                                                   const uint8_t *dst,
999                                                   int32_t dst_stride,
1000                                                   const uint8_t *filter_horiz,
1001                                                   const uint8_t *filter_vert,
1002                                                   int32_t height,
1003                                                   int32_t *diff) {
1004   uint32_t loop_cnt, sse = 0;
1005   int32_t diff0[4];
1006 
1007   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1008     sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
1009                                              filter_horiz, filter_vert, height,
1010                                              &diff0[loop_cnt]);
1011     src += 16;
1012     dst += 16;
1013   }
1014 
1015   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1016 
1017   return sse;
1018 }
1019 
sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * sec_pred,const uint8_t * filter,int32_t height,int32_t * diff)1020 static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
1021                                                     int32_t src_stride,
1022                                                     const uint8_t *dst,
1023                                                     int32_t dst_stride,
1024                                                     const uint8_t *sec_pred,
1025                                                     const uint8_t *filter,
1026                                                     int32_t height,
1027                                                     int32_t *diff) {
1028   int16_t filtval;
1029   uint32_t loop_cnt;
1030   uint32_t ref0, ref1, ref2, ref3;
1031   v16u8 out, pred, filt0, ref = { 0 };
1032   v16i8 src0, src1, src2, src3;
1033   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1034   v8u16 vec0, vec1, vec2, vec3;
1035   v8i16 avg = { 0 };
1036   v4i32 vec, var = { 0 };
1037 
1038   filtval = LH(filter);
1039   filt0 = (v16u8)__msa_fill_h(filtval);
1040 
1041   for (loop_cnt = (height >> 2); loop_cnt--;) {
1042     LD_SB4(src, src_stride, src0, src1, src2, src3);
1043     src += (4 * src_stride);
1044     pred = LD_UB(sec_pred);
1045     sec_pred += 16;
1046     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1047     dst += (4 * dst_stride);
1048 
1049     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1050     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1051     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1052     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1053                 vec0, vec1, vec2, vec3);
1054     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
1055     PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
1056                 src0, src1, src2, src3);
1057     ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
1058     out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
1059     out = __msa_aver_u_b(out, pred);
1060     CALC_MSE_AVG_B(out, ref, var, avg);
1061   }
1062 
1063   vec = __msa_hadd_s_w(avg, avg);
1064   *diff = HADD_SW_S32(vec);
1065 
1066   return HADD_SW_S32(var);
1067 }
1068 
sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * sec_pred,const uint8_t * filter,int32_t height,int32_t * diff)1069 static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
1070                                                     int32_t src_stride,
1071                                                     const uint8_t *dst,
1072                                                     int32_t dst_stride,
1073                                                     const uint8_t *sec_pred,
1074                                                     const uint8_t *filter,
1075                                                     int32_t height,
1076                                                     int32_t *diff) {
1077   int16_t filtval;
1078   uint32_t loop_cnt;
1079   v16u8 out, pred, filt0;
1080   v16u8 ref0, ref1, ref2, ref3;
1081   v16i8 src0, src1, src2, src3;
1082   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1083   v8u16 vec0, vec1, vec2, vec3;
1084   v8i16 avg = { 0 };
1085   v4i32 vec, var = { 0 };
1086 
1087   filtval = LH(filter);
1088   filt0 = (v16u8)__msa_fill_h(filtval);
1089 
1090   for (loop_cnt = (height >> 2); loop_cnt--;) {
1091     LD_SB4(src, src_stride, src0, src1, src2, src3);
1092     src += (4 * src_stride);
1093     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1094     dst += (4 * dst_stride);
1095 
1096     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1097     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1098     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1099     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1100                 vec0, vec1, vec2, vec3);
1101     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
1102     PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
1103                 src0, src1, src2, src3);
1104     out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
1105 
1106     pred = LD_UB(sec_pred);
1107     sec_pred += 16;
1108     out = __msa_aver_u_b(out, pred);
1109     CALC_MSE_AVG_B(out, ref0, var, avg);
1110     out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
1111     pred = LD_UB(sec_pred);
1112     sec_pred += 16;
1113     out = __msa_aver_u_b(out, pred);
1114     CALC_MSE_AVG_B(out, ref1, var, avg);
1115   }
1116 
1117   vec = __msa_hadd_s_w(avg, avg);
1118   *diff = HADD_SW_S32(vec);
1119 
1120   return HADD_SW_S32(var);
1121 }
1122 
subpel_avg_ssediff_16w_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * sec_pred,const uint8_t * filter,int32_t height,int32_t * diff,int32_t width)1123 static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
1124                                              int32_t src_stride,
1125                                              const uint8_t *dst,
1126                                              int32_t dst_stride,
1127                                              const uint8_t *sec_pred,
1128                                              const uint8_t *filter,
1129                                              int32_t height,
1130                                              int32_t *diff,
1131                                              int32_t width) {
1132   int16_t filtval;
1133   uint32_t loop_cnt;
1134   v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1135   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1136   v16u8 dst0, dst1, dst2, dst3;
1137   v16u8 tmp0, tmp1, tmp2, tmp3;
1138   v16u8 pred0, pred1, pred2, pred3, filt0;
1139   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1140   v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
1141   v8i16 avg = { 0 };
1142   v4i32 vec, var = { 0 };
1143 
1144   filtval = LH(filter);
1145   filt0 = (v16u8)__msa_fill_h(filtval);
1146 
1147   for (loop_cnt = (height >> 2); loop_cnt--;) {
1148     LD_SB4(src, src_stride, src0, src2, src4, src6);
1149     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1150     src += (4 * src_stride);
1151     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1152     dst += (4 * dst_stride);
1153     LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1154     sec_pred += (4 * width);
1155 
1156     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1157     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1158     VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
1159     VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
1160     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1161                 out0, out1, out2, out3);
1162     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1163                 out4, out5, out6, out7);
1164     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
1165     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
1166     PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
1167                 tmp0, tmp1, tmp2, tmp3);
1168     AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
1169                 tmp0, tmp1, tmp2, tmp3);
1170 
1171     CALC_MSE_AVG_B(tmp0, dst0, var, avg);
1172     CALC_MSE_AVG_B(tmp1, dst1, var, avg);
1173     CALC_MSE_AVG_B(tmp2, dst2, var, avg);
1174     CALC_MSE_AVG_B(tmp3, dst3, var, avg);
1175   }
1176 
1177   vec = __msa_hadd_s_w(avg, avg);
1178   *diff = HADD_SW_S32(vec);
1179 
1180   return HADD_SW_S32(var);
1181 }
1182 
sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * sec_pred,const uint8_t * filter,int32_t height,int32_t * diff)1183 static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src,
1184                                                      int32_t src_stride,
1185                                                      const uint8_t *dst,
1186                                                      int32_t dst_stride,
1187                                                      const uint8_t *sec_pred,
1188                                                      const uint8_t *filter,
1189                                                      int32_t height,
1190                                                      int32_t *diff) {
1191   return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
1192                                       sec_pred, filter, height, diff, 16);
1193 }
1194 
sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * sec_pred,const uint8_t * filter,int32_t height,int32_t * diff)1195 static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
1196                                                      int32_t src_stride,
1197                                                      const uint8_t *dst,
1198                                                      int32_t dst_stride,
1199                                                      const uint8_t *sec_pred,
1200                                                      const uint8_t *filter,
1201                                                      int32_t height,
1202                                                      int32_t *diff) {
1203   uint32_t loop_cnt, sse = 0;
1204   int32_t diff0[2];
1205 
1206   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1207     sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
1208                                         sec_pred, filter, height,
1209                                         &diff0[loop_cnt], 32);
1210     src += 16;
1211     dst += 16;
1212     sec_pred += 16;
1213   }
1214 
1215   *diff = diff0[0] + diff0[1];
1216 
1217   return sse;
1218 }
1219 
sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * sec_pred,const uint8_t * filter,int32_t height,int32_t * diff)1220 static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
1221                                                      int32_t src_stride,
1222                                                      const uint8_t *dst,
1223                                                      int32_t dst_stride,
1224                                                      const uint8_t *sec_pred,
1225                                                      const uint8_t *filter,
1226                                                      int32_t height,
1227                                                      int32_t *diff) {
1228   uint32_t loop_cnt, sse = 0;
1229   int32_t diff0[4];
1230 
1231   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1232     sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
1233                                         sec_pred, filter, height,
1234                                         &diff0[loop_cnt], 64);
1235     src += 16;
1236     dst += 16;
1237     sec_pred += 16;
1238   }
1239 
1240   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1241 
1242   return sse;
1243 }
1244 
static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    const uint8_t *dst,
                                                    int32_t dst_stride,
                                                    const uint8_t *sec_pred,
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 out, pred, ref = { 0 };
  v16u8 src2110, src4332, filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    const uint8_t *dst,
                                                    int32_t dst_stride,
                                                    const uint8_t *sec_pred,
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, filt0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
               vec0, vec1, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

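/* 16-pixel-wide vertical helper shared by the 16/32/64 wrappers below;
 * the width argument is only used as the stride of the second predictor. */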
static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             const uint8_t *dst,
                                             int32_t dst_stride,
                                             const uint8_t *sec_pred,
                                             const uint8_t *filter,
                                             int32_t height,
                                             int32_t *diff,
                                             int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
                out0, out1, out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                        sec_pred, filter, height,
                                        &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                        sec_pred, filter, height,
                                        &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

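/* Combined horizontal + vertical cases: rows are first filtered with
 * filt_hz via HORIZ_2TAP_FILT_UH, then neighbouring filtered rows are
 * blended with filt_vt before averaging with the second predictor. */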
static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 out, pred, ref = { 0 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

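/* 16-pixel-wide horizontal + vertical helper shared by the 16/32/64
 * wrappers below; as above, width is only the second-predictor stride. */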
static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              const uint8_t *dst,
                                              int32_t dst_stride,
                                              const uint8_t *sec_pred,
                                              const uint8_t *filter_horiz,
                                              const uint8_t *filter_vert,
                                              int32_t height,
                                              int32_t *diff,
                                              int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
                out0, out1, out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

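/* variance = sse - (sum * sum) / (w * h); the shift is log2(w * h).
 * Blocks of 16x32 and larger square the sum in 64 bits to avoid overflow. */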
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

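/* Sub-pixel variance: dispatch on the x/y offsets to the horizontal,
 * vertical or combined filtering path; when both offsets are zero this
 * falls back to the integer-pel vpx_variance<wd>x<ht>_msa(). */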
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                         \
uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src,     \
                                                 int32_t src_stride,     \
                                                 int32_t xoffset,        \
                                                 int32_t yoffset,        \
                                                 const uint8_t *ref,     \
                                                 int32_t ref_stride,     \
                                                 uint32_t *sse) {        \
  int32_t diff;                                                          \
  uint32_t var;                                                          \
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];               \
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];               \
                                                                         \
  if (yoffset) {                                                         \
    if (xoffset) {                                                       \
      *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride,      \
                                                   ref, ref_stride,      \
                                                   h_filter, v_filter,   \
                                                   ht, &diff);           \
    } else {                                                             \
      *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride,       \
                                                  ref, ref_stride,       \
                                                  v_filter, ht, &diff);  \
    }                                                                    \
                                                                         \
    var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                          \
  } else {                                                               \
    if (xoffset) {                                                       \
      *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride,       \
                                                  ref, ref_stride,       \
                                                  h_filter, ht, &diff);  \
                                                                         \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                        \
    } else {                                                             \
      var = vpx_variance##wd##x##ht##_msa(src, src_stride,               \
                                          ref, ref_stride, sse);         \
    }                                                                    \
  }                                                                      \
                                                                         \
  return var;                                                            \
}

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);

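/* "avg" variants: the filtered block is first averaged with sec_pred and
 * the SSE/sum are then taken against that averaged prediction. */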
#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                         \
  const uint8_t *src_ptr, int32_t src_stride,                                 \
  int32_t xoffset, int32_t yoffset,                                           \
  const uint8_t *ref_ptr, int32_t ref_stride,                                 \
  uint32_t *sse, const uint8_t *sec_pred) {                                   \
  int32_t diff;                                                               \
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                    \
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                    \
                                                                              \
  if (yoffset) {                                                              \
    if (xoffset) {                                                            \
      *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride,   \
                                                       ref_ptr, ref_stride,   \
                                                       sec_pred, h_filter,    \
                                                       v_filter, ht, &diff);  \
    } else {                                                                  \
      *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride,    \
                                                      ref_ptr, ref_stride,    \
                                                      sec_pred, v_filter,     \
                                                      ht, &diff);             \
    }                                                                         \
  } else {                                                                    \
    if (xoffset) {                                                            \
      *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride,    \
                                                      ref_ptr, ref_stride,    \
                                                      sec_pred, h_filter,     \
                                                      ht, &diff);             \
    } else {                                                                  \
      *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride,                \
                                          ref_ptr, ref_stride,                \
                                          sec_pred, ht, &diff);               \
    }                                                                         \
  }                                                                           \
                                                                              \
  return VARIANCE_##wd##Wx##ht##H(*sse, diff);                                \
}

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);

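/* 32x64 is written out by hand: its zero-offset branch calls the dedicated
 * avg_sse_diff_32x64_msa() helper instead of the width-based helper the
 * macro above would pick. */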
uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t xoffset,
                                             int32_t yoffset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride,
                                             uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];

  if (yoffset) {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
                                                   ref_ptr, ref_stride,
                                                   sec_pred, h_filter,
                                                   v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
                                                  ref_ptr, ref_stride,
                                                  sec_pred, v_filter,
                                                  64, &diff);
    }
  } else {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
                                                  ref_ptr, ref_stride,
                                                  sec_pred, h_filter,
                                                  64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}

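/* The 64-wide sizes get their own macro for the same reason: the
 * zero-offset branch uses the avg_sse_diff_64x<ht>_msa() helpers. */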
#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                          \
uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr,     \
                                                 int32_t src_stride,         \
                                                 int32_t xoffset,            \
                                                 int32_t yoffset,            \
                                                 const uint8_t *ref_ptr,     \
                                                 int32_t ref_stride,         \
                                                 uint32_t *sse,              \
                                                 const uint8_t *sec_pred) {  \
  int32_t diff;                                                              \
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                   \
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                   \
                                                                             \
  if (yoffset) {                                                             \
    if (xoffset) {                                                           \
      *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride,      \
                                                   ref_ptr, ref_stride,      \
                                                   sec_pred, h_filter,       \
                                                   v_filter, ht, &diff);     \
    } else {                                                                 \
      *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride,       \
                                                  ref_ptr, ref_stride,       \
                                                  sec_pred, v_filter,        \
                                                  ht, &diff);                \
    }                                                                        \
  } else {                                                                   \
    if (xoffset) {                                                           \
      *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride,       \
                                                  ref_ptr, ref_stride,       \
                                                  sec_pred, h_filter,        \
                                                  ht, &diff);                \
    } else {                                                                 \
      *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride,                 \
                                        ref_ptr, ref_stride,                 \
                                        sec_pred, &diff);                    \
    }                                                                        \
  }                                                                          \
                                                                             \
  return VARIANCE_64Wx##ht##H(*sse, diff);                                   \
}

VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);