1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_ports/mem.h"
13 #include "vpx_dsp/mips/macros_msa.h"
14 #include "vpx_dsp/variance.h"
15
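/* 2-tap bilinear filter coefficients for the eight 1/8-pel positions; each
 * pair of taps sums to 128 (1 << FILTER_BITS). */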
16 static const uint8_t bilinear_filters_msa[8][2] = {
17 { 128, 0, },
18 { 112, 16, },
19 { 96, 32, },
20 { 80, 48, },
21 { 64, 64, },
22 { 48, 80, },
23 { 32, 96, },
24 { 16, 112, },
25 };
26
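/* For one vector of 16 byte pairs: accumulates the sum of squared differences
 * (src - ref) into 'var' and adds the per-lane signed differences to 'sub'. */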
27 #define CALC_MSE_AVG_B(src, ref, var, sub) { \
28 v16u8 src_l0_m, src_l1_m; \
29 v8i16 res_l0_m, res_l1_m; \
30 \
31 ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
32 HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
33 DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
34 \
35 sub += res_l0_m + res_l1_m; \
36 }
37
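/* variance = sse - (sum * sum) / (w * h), with 'shift' = log2(w * h). The
 * LARGE variant widens sum * sum to 64 bits for block sizes where the 32-bit
 * product could overflow. */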
38 #define VARIANCE_WxH(sse, diff, shift) \
39 sse - (((uint32_t)diff * diff) >> shift)
40
41 #define VARIANCE_LARGE_WxH(sse, diff, shift) \
42 sse - (((int64_t)diff * diff) >> shift)
43
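/* The avg_sse_diff_* helpers average the source block with 'sec_pred'
 * (compound prediction) before comparing against the reference; each returns
 * the SSE and writes the sum of differences to '*diff'. The loops consume 4
 * rows per iteration, so 'height' is assumed to be a multiple of 4. */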
44 static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
45 int32_t src_stride,
46 const uint8_t *ref_ptr,
47 int32_t ref_stride,
48 const uint8_t *sec_pred,
49 int32_t height,
50 int32_t *diff) {
51 int32_t ht_cnt;
52 uint32_t src0, src1, src2, src3;
53 uint32_t ref0, ref1, ref2, ref3;
54 v16u8 pred, src = { 0 };
55 v16u8 ref = { 0 };
56 v8i16 avg = { 0 };
57 v4i32 vec, var = { 0 };
58
59 for (ht_cnt = (height >> 2); ht_cnt--;) {
60 pred = LD_UB(sec_pred);
61 sec_pred += 16;
62 LW4(src_ptr, src_stride, src0, src1, src2, src3);
63 src_ptr += (4 * src_stride);
64 LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
65 ref_ptr += (4 * ref_stride);
66
67 INSERT_W4_UB(src0, src1, src2, src3, src);
68 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
69
70 src = __msa_aver_u_b(src, pred);
71 CALC_MSE_AVG_B(src, ref, var, avg);
72 }
73
74 vec = __msa_hadd_s_w(avg, avg);
75 *diff = HADD_SW_S32(vec);
76
77 return HADD_SW_S32(var);
78 }
79
80 static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
81 int32_t src_stride,
82 const uint8_t *ref_ptr,
83 int32_t ref_stride,
84 const uint8_t *sec_pred,
85 int32_t height,
86 int32_t *diff) {
87 int32_t ht_cnt;
88 v16u8 src0, src1, src2, src3;
89 v16u8 ref0, ref1, ref2, ref3;
90 v16u8 pred0, pred1;
91 v8i16 avg = { 0 };
92 v4i32 vec, var = { 0 };
93
94 for (ht_cnt = (height >> 2); ht_cnt--;) {
95 LD_UB2(sec_pred, 16, pred0, pred1);
96 sec_pred += 32;
97 LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
98 src_ptr += (4 * src_stride);
99 LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
100 ref_ptr += (4 * ref_stride);
101
102 PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
103 src0, src1, ref0, ref1);
104 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
105 CALC_MSE_AVG_B(src0, ref0, var, avg);
106 CALC_MSE_AVG_B(src1, ref1, var, avg);
107 }
108
109 vec = __msa_hadd_s_w(avg, avg);
110 *diff = HADD_SW_S32(vec);
111
112 return HADD_SW_S32(var);
113 }
114
115 static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
116 int32_t src_stride,
117 const uint8_t *ref_ptr,
118 int32_t ref_stride,
119 const uint8_t *sec_pred,
120 int32_t height,
121 int32_t *diff) {
122 int32_t ht_cnt;
123 v16u8 src, ref, pred;
124 v8i16 avg = { 0 };
125 v4i32 vec, var = { 0 };
126
127 for (ht_cnt = (height >> 2); ht_cnt--;) {
128 pred = LD_UB(sec_pred);
129 sec_pred += 16;
130 src = LD_UB(src_ptr);
131 src_ptr += src_stride;
132 ref = LD_UB(ref_ptr);
133 ref_ptr += ref_stride;
134 src = __msa_aver_u_b(src, pred);
135 CALC_MSE_AVG_B(src, ref, var, avg);
136
137 pred = LD_UB(sec_pred);
138 sec_pred += 16;
139 src = LD_UB(src_ptr);
140 src_ptr += src_stride;
141 ref = LD_UB(ref_ptr);
142 ref_ptr += ref_stride;
143 src = __msa_aver_u_b(src, pred);
144 CALC_MSE_AVG_B(src, ref, var, avg);
145
146 pred = LD_UB(sec_pred);
147 sec_pred += 16;
148 src = LD_UB(src_ptr);
149 src_ptr += src_stride;
150 ref = LD_UB(ref_ptr);
151 ref_ptr += ref_stride;
152 src = __msa_aver_u_b(src, pred);
153 CALC_MSE_AVG_B(src, ref, var, avg);
154
155 pred = LD_UB(sec_pred);
156 sec_pred += 16;
157 src = LD_UB(src_ptr);
158 src_ptr += src_stride;
159 ref = LD_UB(ref_ptr);
160 ref_ptr += ref_stride;
161 src = __msa_aver_u_b(src, pred);
162 CALC_MSE_AVG_B(src, ref, var, avg);
163 }
164
165 vec = __msa_hadd_s_w(avg, avg);
166 *diff = HADD_SW_S32(vec);
167
168 return HADD_SW_S32(var);
169 }
170
171 static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
172 int32_t src_stride,
173 const uint8_t *ref_ptr,
174 int32_t ref_stride,
175 const uint8_t *sec_pred,
176 int32_t height,
177 int32_t *diff) {
178 int32_t ht_cnt;
179 v16u8 src0, src1, ref0, ref1, pred0, pred1;
180 v8i16 avg = { 0 };
181 v4i32 vec, var = { 0 };
182
183 for (ht_cnt = (height >> 2); ht_cnt--;) {
184 LD_UB2(sec_pred, 16, pred0, pred1);
185 sec_pred += 32;
186 LD_UB2(src_ptr, 16, src0, src1);
187 src_ptr += src_stride;
188 LD_UB2(ref_ptr, 16, ref0, ref1);
189 ref_ptr += ref_stride;
190 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
191 CALC_MSE_AVG_B(src0, ref0, var, avg);
192 CALC_MSE_AVG_B(src1, ref1, var, avg);
193
194 LD_UB2(sec_pred, 16, pred0, pred1);
195 sec_pred += 32;
196 LD_UB2(src_ptr, 16, src0, src1);
197 src_ptr += src_stride;
198 LD_UB2(ref_ptr, 16, ref0, ref1);
199 ref_ptr += ref_stride;
200 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
201 CALC_MSE_AVG_B(src0, ref0, var, avg);
202 CALC_MSE_AVG_B(src1, ref1, var, avg);
203
204 LD_UB2(sec_pred, 16, pred0, pred1);
205 sec_pred += 32;
206 LD_UB2(src_ptr, 16, src0, src1);
207 src_ptr += src_stride;
208 LD_UB2(ref_ptr, 16, ref0, ref1);
209 ref_ptr += ref_stride;
210 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
211 CALC_MSE_AVG_B(src0, ref0, var, avg);
212 CALC_MSE_AVG_B(src1, ref1, var, avg);
213
214 LD_UB2(sec_pred, 16, pred0, pred1);
215 sec_pred += 32;
216 LD_UB2(src_ptr, 16, src0, src1);
217 src_ptr += src_stride;
218 LD_UB2(ref_ptr, 16, ref0, ref1);
219 ref_ptr += ref_stride;
220 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
221 CALC_MSE_AVG_B(src0, ref0, var, avg);
222 CALC_MSE_AVG_B(src1, ref1, var, avg);
223 }
224
225 vec = __msa_hadd_s_w(avg, avg);
226 *diff = HADD_SW_S32(vec);
227
228 return HADD_SW_S32(var);
229 }
230
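/* The fixed-size 32x64/64x32/64x64 variants split the difference sum across
 * multiple halfword accumulators, presumably so the per-lane sums stay within
 * the signed 16-bit range. */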
231 static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
232 int32_t src_stride,
233 const uint8_t *ref_ptr,
234 int32_t ref_stride,
235 const uint8_t *sec_pred,
236 int32_t *diff) {
237 int32_t ht_cnt;
238 v16u8 src0, src1, ref0, ref1, pred0, pred1;
239 v8i16 avg0 = { 0 };
240 v8i16 avg1 = { 0 };
241 v4i32 vec, var = { 0 };
242
243 for (ht_cnt = 16; ht_cnt--;) {
244 LD_UB2(sec_pred, 16, pred0, pred1);
245 sec_pred += 32;
246 LD_UB2(src_ptr, 16, src0, src1);
247 src_ptr += src_stride;
248 LD_UB2(ref_ptr, 16, ref0, ref1);
249 ref_ptr += ref_stride;
250 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
251 CALC_MSE_AVG_B(src0, ref0, var, avg0);
252 CALC_MSE_AVG_B(src1, ref1, var, avg1);
253
254 LD_UB2(sec_pred, 16, pred0, pred1);
255 sec_pred += 32;
256 LD_UB2(src_ptr, 16, src0, src1);
257 src_ptr += src_stride;
258 LD_UB2(ref_ptr, 16, ref0, ref1);
259 ref_ptr += ref_stride;
260 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
261 CALC_MSE_AVG_B(src0, ref0, var, avg0);
262 CALC_MSE_AVG_B(src1, ref1, var, avg1);
263
264 LD_UB2(sec_pred, 16, pred0, pred1);
265 sec_pred += 32;
266 LD_UB2(src_ptr, 16, src0, src1);
267 src_ptr += src_stride;
268 LD_UB2(ref_ptr, 16, ref0, ref1);
269 ref_ptr += ref_stride;
270 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
271 CALC_MSE_AVG_B(src0, ref0, var, avg0);
272 CALC_MSE_AVG_B(src1, ref1, var, avg1);
273
274 LD_UB2(sec_pred, 16, pred0, pred1);
275 sec_pred += 32;
276 LD_UB2(src_ptr, 16, src0, src1);
277 src_ptr += src_stride;
278 LD_UB2(ref_ptr, 16, ref0, ref1);
279 ref_ptr += ref_stride;
280 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
281 CALC_MSE_AVG_B(src0, ref0, var, avg0);
282 CALC_MSE_AVG_B(src1, ref1, var, avg1);
283 }
284
285 vec = __msa_hadd_s_w(avg0, avg0);
286 vec += __msa_hadd_s_w(avg1, avg1);
287 *diff = HADD_SW_S32(vec);
288
289 return HADD_SW_S32(var);
290 }
291
292 static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
293 int32_t src_stride,
294 const uint8_t *ref_ptr,
295 int32_t ref_stride,
296 const uint8_t *sec_pred,
297 int32_t *diff) {
298 int32_t ht_cnt;
299 v16u8 src0, src1, src2, src3;
300 v16u8 ref0, ref1, ref2, ref3;
301 v16u8 pred0, pred1, pred2, pred3;
302 v8i16 avg0 = { 0 };
303 v8i16 avg1 = { 0 };
304 v4i32 vec, var = { 0 };
305
306 for (ht_cnt = 16; ht_cnt--;) {
307 LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
308 sec_pred += 64;
309 LD_UB4(src_ptr, 16, src0, src1, src2, src3);
310 src_ptr += src_stride;
311 LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
312 ref_ptr += ref_stride;
313 AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
314 src0, src1, src2, src3);
315 CALC_MSE_AVG_B(src0, ref0, var, avg0);
316 CALC_MSE_AVG_B(src2, ref2, var, avg0);
317 CALC_MSE_AVG_B(src1, ref1, var, avg1);
318 CALC_MSE_AVG_B(src3, ref3, var, avg1);
319
320 LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
321 sec_pred += 64;
322 LD_UB4(src_ptr, 16, src0, src1, src2, src3);
323 src_ptr += src_stride;
324 LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
325 ref_ptr += ref_stride;
326 AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
327 src0, src1, src2, src3);
328 CALC_MSE_AVG_B(src0, ref0, var, avg0);
329 CALC_MSE_AVG_B(src2, ref2, var, avg0);
330 CALC_MSE_AVG_B(src1, ref1, var, avg1);
331 CALC_MSE_AVG_B(src3, ref3, var, avg1);
332 }
333
334 vec = __msa_hadd_s_w(avg0, avg0);
335 vec += __msa_hadd_s_w(avg1, avg1);
336
337 *diff = HADD_SW_S32(vec);
338
339 return HADD_SW_S32(var);
340 }
341
342 static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
343 int32_t src_stride,
344 const uint8_t *ref_ptr,
345 int32_t ref_stride,
346 const uint8_t *sec_pred,
347 int32_t *diff) {
348 int32_t ht_cnt;
349 v16u8 src0, src1, src2, src3;
350 v16u8 ref0, ref1, ref2, ref3;
351 v16u8 pred0, pred1, pred2, pred3;
352 v8i16 avg0 = { 0 };
353 v8i16 avg1 = { 0 };
354 v8i16 avg2 = { 0 };
355 v8i16 avg3 = { 0 };
356 v4i32 vec, var = { 0 };
357
358 for (ht_cnt = 32; ht_cnt--;) {
359 LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
360 sec_pred += 64;
361 LD_UB4(src_ptr, 16, src0, src1, src2, src3);
362 src_ptr += src_stride;
363 LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
364 ref_ptr += ref_stride;
365 AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
366 src0, src1, src2, src3);
367 CALC_MSE_AVG_B(src0, ref0, var, avg0);
368 CALC_MSE_AVG_B(src1, ref1, var, avg1);
369 CALC_MSE_AVG_B(src2, ref2, var, avg2);
370 CALC_MSE_AVG_B(src3, ref3, var, avg3);
371
372 LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
373 sec_pred += 64;
374 LD_UB4(src_ptr, 16, src0, src1, src2, src3);
375 src_ptr += src_stride;
376 LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
377 ref_ptr += ref_stride;
378 AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
379 src0, src1, src2, src3);
380 CALC_MSE_AVG_B(src0, ref0, var, avg0);
381 CALC_MSE_AVG_B(src1, ref1, var, avg1);
382 CALC_MSE_AVG_B(src2, ref2, var, avg2);
383 CALC_MSE_AVG_B(src3, ref3, var, avg3);
384 }
385
386 vec = __msa_hadd_s_w(avg0, avg0);
387 vec += __msa_hadd_s_w(avg1, avg1);
388 vec += __msa_hadd_s_w(avg2, avg2);
389 vec += __msa_hadd_s_w(avg3, avg3);
390 *diff = HADD_SW_S32(vec);
391
392 return HADD_SW_S32(var);
393 }
394
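/* sub_pixel_sse_diff_*_h: apply the 2-tap horizontal bilinear filter given by
 * 'filter' to the source, then accumulate the SSE and the difference sum
 * against 'dst'. */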
395 static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
396 int32_t src_stride,
397 const uint8_t *dst,
398 int32_t dst_stride,
399 const uint8_t *filter,
400 int32_t height,
401 int32_t *diff) {
402 int16_t filtval;
403 uint32_t loop_cnt;
404 uint32_t ref0, ref1, ref2, ref3;
405 v16u8 filt0, ref = { 0 };
406 v16i8 src0, src1, src2, src3;
407 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
408 v8u16 vec0, vec1, vec2, vec3;
409 v8i16 avg = { 0 };
410 v4i32 vec, var = { 0 };
411
412 filtval = LH(filter);
413 filt0 = (v16u8)__msa_fill_h(filtval);
414
415 for (loop_cnt = (height >> 2); loop_cnt--;) {
416 LD_SB4(src, src_stride, src0, src1, src2, src3);
417 src += (4 * src_stride);
418 LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
419 dst += (4 * dst_stride);
420 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
421 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
422 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
423 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
424 vec0, vec1, vec2, vec3);
425 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
426 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
427 src0, src1, src2, src3);
428 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
429 src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
430 CALC_MSE_AVG_B(src0, ref, var, avg);
431 }
432
433 vec = __msa_hadd_s_w(avg, avg);
434 *diff = HADD_SW_S32(vec);
435
436 return HADD_SW_S32(var);
437 }
438
439 static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
440 int32_t src_stride,
441 const uint8_t *dst,
442 int32_t dst_stride,
443 const uint8_t *filter,
444 int32_t height,
445 int32_t *diff) {
446 int16_t filtval;
447 uint32_t loop_cnt;
448 v16u8 filt0, out, ref0, ref1, ref2, ref3;
449 v16i8 src0, src1, src2, src3;
450 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
451 v8u16 vec0, vec1, vec2, vec3;
452 v8i16 avg = { 0 };
453 v4i32 vec, var = { 0 };
454
455 filtval = LH(filter);
456 filt0 = (v16u8)__msa_fill_h(filtval);
457
458 for (loop_cnt = (height >> 2); loop_cnt--;) {
459 LD_SB4(src, src_stride, src0, src1, src2, src3);
460 src += (4 * src_stride);
461 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
462 dst += (4 * dst_stride);
463
464 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
465 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
466 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
467 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
468 vec0, vec1, vec2, vec3);
469 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
470 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
471 src0, src1, src2, src3);
472 out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
473 CALC_MSE_AVG_B(out, ref0, var, avg);
474 out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
475 CALC_MSE_AVG_B(out, ref1, var, avg);
476 }
477
478 vec = __msa_hadd_s_w(avg, avg);
479 *diff = HADD_SW_S32(vec);
480
481 return HADD_SW_S32(var);
482 }
483
484 static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
485 int32_t src_stride,
486 const uint8_t *dst,
487 int32_t dst_stride,
488 const uint8_t *filter,
489 int32_t height,
490 int32_t *diff) {
491 int16_t filtval;
492 uint32_t loop_cnt;
493 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
494 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
495 v16u8 dst0, dst1, dst2, dst3, filt0;
496 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
497 v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
498 v8i16 avg = { 0 };
499 v4i32 vec, var = { 0 };
500
501 filtval = LH(filter);
502 filt0 = (v16u8)__msa_fill_h(filtval);
503
504 for (loop_cnt = (height >> 2); loop_cnt--;) {
505 LD_SB4(src, src_stride, src0, src2, src4, src6);
506 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
507 src += (4 * src_stride);
508 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
509 dst += (4 * dst_stride);
510
511 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
512 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
513 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
514 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
515 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
516 out0, out1, out2, out3);
517 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
518 out4, out5, out6, out7);
519 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
520 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
521 PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
522 src0, src1, src2, src3);
523 CALC_MSE_AVG_B(src0, dst0, var, avg);
524 CALC_MSE_AVG_B(src1, dst1, var, avg);
525 CALC_MSE_AVG_B(src2, dst2, var, avg);
526 CALC_MSE_AVG_B(src3, dst3, var, avg);
527 }
528
529 vec = __msa_hadd_s_w(avg, avg);
530 *diff = HADD_SW_S32(vec);
531
532 return HADD_SW_S32(var);
533 }
534
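/* Wider blocks are handled as independent 16-pixel columns whose partial
 * difference sums are added together. */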
535 static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
536 int32_t src_stride,
537 const uint8_t *dst,
538 int32_t dst_stride,
539 const uint8_t *filter,
540 int32_t height,
541 int32_t *diff) {
542 uint32_t loop_cnt, sse = 0;
543 int32_t diff0[2];
544
545 for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
546 sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
547 filter, height, &diff0[loop_cnt]);
548 src += 16;
549 dst += 16;
550 }
551
552 *diff = diff0[0] + diff0[1];
553
554 return sse;
555 }
556
557 static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
558 int32_t src_stride,
559 const uint8_t *dst,
560 int32_t dst_stride,
561 const uint8_t *filter,
562 int32_t height,
563 int32_t *diff) {
564 uint32_t loop_cnt, sse = 0;
565 int32_t diff0[4];
566
567 for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
568 sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
569 filter, height, &diff0[loop_cnt]);
570 src += 16;
571 dst += 16;
572 }
573
574 *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
575
576 return sse;
577 }
578
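/* Vertical counterparts: the 2-tap filter runs across rows, so one source row
 * is pre-loaded before the loop. */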
579 static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
580 int32_t src_stride,
581 const uint8_t *dst,
582 int32_t dst_stride,
583 const uint8_t *filter,
584 int32_t height,
585 int32_t *diff) {
586 int16_t filtval;
587 uint32_t loop_cnt;
588 uint32_t ref0, ref1, ref2, ref3;
589 v16u8 src0, src1, src2, src3, src4, out;
590 v16u8 src10_r, src32_r, src21_r, src43_r;
591 v16u8 ref = { 0 };
592 v16u8 src2110, src4332;
593 v16u8 filt0;
594 v8i16 avg = { 0 };
595 v4i32 vec, var = { 0 };
596 v8u16 tmp0, tmp1;
597
598 filtval = LH(filter);
599 filt0 = (v16u8)__msa_fill_h(filtval);
600
601 src0 = LD_UB(src);
602 src += src_stride;
603
604 for (loop_cnt = (height >> 2); loop_cnt--;) {
605 LD_UB4(src, src_stride, src1, src2, src3, src4);
606 src += (4 * src_stride);
607 LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
608 dst += (4 * dst_stride);
609
610 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
611 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
612 src10_r, src21_r, src32_r, src43_r);
613 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
614 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
615 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
616 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
617 CALC_MSE_AVG_B(out, ref, var, avg);
618 src0 = src4;
619 }
620
621 vec = __msa_hadd_s_w(avg, avg);
622 *diff = HADD_SW_S32(vec);
623
624 return HADD_SW_S32(var);
625 }
626
627 static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
628 int32_t src_stride,
629 const uint8_t *dst,
630 int32_t dst_stride,
631 const uint8_t *filter,
632 int32_t height,
633 int32_t *diff) {
634 int16_t filtval;
635 uint32_t loop_cnt;
636 v16u8 src0, src1, src2, src3, src4;
637 v16u8 ref0, ref1, ref2, ref3;
638 v8u16 vec0, vec1, vec2, vec3;
639 v8u16 tmp0, tmp1, tmp2, tmp3;
640 v16u8 filt0;
641 v8i16 avg = { 0 };
642 v4i32 vec, var = { 0 };
643
644 filtval = LH(filter);
645 filt0 = (v16u8)__msa_fill_h(filtval);
646
647 src0 = LD_UB(src);
648 src += src_stride;
649
650 for (loop_cnt = (height >> 2); loop_cnt--;) {
651 LD_UB4(src, src_stride, src1, src2, src3, src4);
652 src += (4 * src_stride);
653 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
654 dst += (4 * dst_stride);
655
656 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
657 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
658 vec0, vec1, vec2, vec3);
659 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
660 tmp0, tmp1, tmp2, tmp3);
661 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
662 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
663 CALC_MSE_AVG_B(src0, ref0, var, avg);
664 CALC_MSE_AVG_B(src1, ref1, var, avg);
665 src0 = src4;
666 }
667
668 vec = __msa_hadd_s_w(avg, avg);
669 *diff = HADD_SW_S32(vec);
670
671 return HADD_SW_S32(var);
672 }
673
674 static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
675 int32_t src_stride,
676 const uint8_t *dst,
677 int32_t dst_stride,
678 const uint8_t *filter,
679 int32_t height,
680 int32_t *diff) {
681 int16_t filtval;
682 uint32_t loop_cnt;
683 v16u8 ref0, ref1, ref2, ref3;
684 v16u8 src0, src1, src2, src3, src4;
685 v16u8 out0, out1, out2, out3;
686 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
687 v8u16 tmp0, tmp1, tmp2, tmp3;
688 v16u8 filt0;
689 v8i16 avg = { 0 };
690 v4i32 vec, var = { 0 };
691
692 filtval = LH(filter);
693 filt0 = (v16u8)__msa_fill_h(filtval);
694
695 src0 = LD_UB(src);
696 src += src_stride;
697
698 for (loop_cnt = (height >> 2); loop_cnt--;) {
699 LD_UB4(src, src_stride, src1, src2, src3, src4);
700 src += (4 * src_stride);
701 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
702 dst += (4 * dst_stride);
703
704 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
705 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
706 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
707 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
708 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
709
710 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
711 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
712 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
713 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
714 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
715
716 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
717 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
718 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
719 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
720 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
721 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
722
723 src0 = src4;
724
725 CALC_MSE_AVG_B(out0, ref0, var, avg);
726 CALC_MSE_AVG_B(out1, ref1, var, avg);
727 CALC_MSE_AVG_B(out2, ref2, var, avg);
728 CALC_MSE_AVG_B(out3, ref3, var, avg);
729 }
730
731 vec = __msa_hadd_s_w(avg, avg);
732 *diff = HADD_SW_S32(vec);
733
734 return HADD_SW_S32(var);
735 }
736
737 static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
738 int32_t src_stride,
739 const uint8_t *dst,
740 int32_t dst_stride,
741 const uint8_t *filter,
742 int32_t height,
743 int32_t *diff) {
744 uint32_t loop_cnt, sse = 0;
745 int32_t diff0[2];
746
747 for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
748 sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
749 filter, height, &diff0[loop_cnt]);
750 src += 16;
751 dst += 16;
752 }
753
754 *diff = diff0[0] + diff0[1];
755
756 return sse;
757 }
758
759 static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
760 int32_t src_stride,
761 const uint8_t *dst,
762 int32_t dst_stride,
763 const uint8_t *filter,
764 int32_t height,
765 int32_t *diff) {
766 uint32_t loop_cnt, sse = 0;
767 int32_t diff0[4];
768
769 for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
770 sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
771 filter, height, &diff0[loop_cnt]);
772 src += 16;
773 dst += 16;
774 }
775
776 *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
777
778 return sse;
779 }
780
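/* Combined horizontal + vertical filtering: each row is filtered horizontally
 * first, then adjacent filtered rows are blended with the vertical taps. */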
781 static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
782 int32_t src_stride,
783 const uint8_t *dst,
784 int32_t dst_stride,
785 const uint8_t *filter_horiz,
786 const uint8_t *filter_vert,
787 int32_t height,
788 int32_t *diff) {
789 int16_t filtval;
790 uint32_t loop_cnt;
791 uint32_t ref0, ref1, ref2, ref3;
792 v16u8 src0, src1, src2, src3, src4;
793 v16u8 out, ref = { 0 };
794 v16u8 filt_vt, filt_hz, vec0, vec1;
795 v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
796 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
797 v8u16 tmp0, tmp1;
798 v8i16 avg = { 0 };
799 v4i32 vec, var = { 0 };
800
801 filtval = LH(filter_horiz);
802 filt_hz = (v16u8)__msa_fill_h(filtval);
803 filtval = LH(filter_vert);
804 filt_vt = (v16u8)__msa_fill_h(filtval);
805
806 src0 = LD_UB(src);
807 src += src_stride;
808
809 for (loop_cnt = (height >> 2); loop_cnt--;) {
810 LD_UB4(src, src_stride, src1, src2, src3, src4);
811 src += (4 * src_stride);
812 LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
813 dst += (4 * dst_stride);
814 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
815 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
816 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
817 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
818 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
819 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
820 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
821 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
822 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
823 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
824 CALC_MSE_AVG_B(out, ref, var, avg);
825 src0 = src4;
826 }
827
828 vec = __msa_hadd_s_w(avg, avg);
829 *diff = HADD_SW_S32(vec);
830
831 return HADD_SW_S32(var);
832 }
833
834 static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
835 int32_t src_stride,
836 const uint8_t *dst,
837 int32_t dst_stride,
838 const uint8_t *filter_horiz,
839 const uint8_t *filter_vert,
840 int32_t height,
841 int32_t *diff) {
842 int16_t filtval;
843 uint32_t loop_cnt;
844 v16u8 ref0, ref1, ref2, ref3;
845 v16u8 src0, src1, src2, src3, src4;
846 v16u8 out0, out1;
847 v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
848 v8u16 hz_out0, hz_out1;
849 v8u16 tmp0, tmp1, tmp2, tmp3;
850 v16u8 filt_vt, filt_hz, vec0;
851 v8i16 avg = { 0 };
852 v4i32 vec, var = { 0 };
853
854 filtval = LH(filter_horiz);
855 filt_hz = (v16u8)__msa_fill_h(filtval);
856 filtval = LH(filter_vert);
857 filt_vt = (v16u8)__msa_fill_h(filtval);
858
859 src0 = LD_UB(src);
860 src += src_stride;
861 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
862
863 for (loop_cnt = (height >> 2); loop_cnt--;) {
864 LD_UB4(src, src_stride, src1, src2, src3, src4);
865 src += (4 * src_stride);
866 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
867 dst += (4 * dst_stride);
868
869 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
870 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
871 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
872 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
873 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
874 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
875 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
876 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
877 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
878 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
879 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
880 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
881 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
882 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
883 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
884 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
885 CALC_MSE_AVG_B(out0, ref0, var, avg);
886 CALC_MSE_AVG_B(out1, ref1, var, avg);
887 }
888
889 vec = __msa_hadd_s_w(avg, avg);
890 *diff = HADD_SW_S32(vec);
891
892 return HADD_SW_S32(var);
893 }
894
895 static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
896 int32_t src_stride,
897 const uint8_t *dst,
898 int32_t dst_stride,
899 const uint8_t *filter_horiz,
900 const uint8_t *filter_vert,
901 int32_t height,
902 int32_t *diff) {
903 int16_t filtval;
904 uint32_t loop_cnt;
905 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
906 v16u8 ref0, ref1, ref2, ref3;
907 v16u8 filt_hz, filt_vt, vec0, vec1;
908 v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
909 v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
910 v8u16 tmp0, tmp1;
911 v8i16 avg = { 0 };
912 v4i32 vec, var = { 0 };
913
914 filtval = LH(filter_horiz);
915 filt_hz = (v16u8)__msa_fill_h(filtval);
916 filtval = LH(filter_vert);
917 filt_vt = (v16u8)__msa_fill_h(filtval);
918
919 LD_UB2(src, 8, src0, src1);
920 src += src_stride;
921
922 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
923 hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
924
925 for (loop_cnt = (height >> 2); loop_cnt--;) {
926 LD_UB4(src, src_stride, src0, src2, src4, src6);
927 LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
928 src += (4 * src_stride);
929 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
930 dst += (4 * dst_stride);
931
932 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
933 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
934 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
935 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
936 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
937 src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
938
939 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
940 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
941 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
942 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
943 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
944 src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
945
946 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
947 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
948 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
949 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
950 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
951 src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
952
953 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
954 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
955 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
956 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
957 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
958 src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
959
960 CALC_MSE_AVG_B(src0, ref0, var, avg);
961 CALC_MSE_AVG_B(src1, ref1, var, avg);
962 CALC_MSE_AVG_B(src2, ref2, var, avg);
963 CALC_MSE_AVG_B(src3, ref3, var, avg);
964 }
965
966 vec = __msa_hadd_s_w(avg, avg);
967 *diff = HADD_SW_S32(vec);
968
969 return HADD_SW_S32(var);
970 }
971
972 static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
973 int32_t src_stride,
974 const uint8_t *dst,
975 int32_t dst_stride,
976 const uint8_t *filter_horiz,
977 const uint8_t *filter_vert,
978 int32_t height,
979 int32_t *diff) {
980 uint32_t loop_cnt, sse = 0;
981 int32_t diff0[2];
982
983 for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
984 sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
985 filter_horiz, filter_vert, height,
986 &diff0[loop_cnt]);
987 src += 16;
988 dst += 16;
989 }
990
991 *diff = diff0[0] + diff0[1];
992
993 return sse;
994 }
995
996 static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
997 int32_t src_stride,
998 const uint8_t *dst,
999 int32_t dst_stride,
1000 const uint8_t *filter_horiz,
1001 const uint8_t *filter_vert,
1002 int32_t height,
1003 int32_t *diff) {
1004 uint32_t loop_cnt, sse = 0;
1005 int32_t diff0[4];
1006
1007 for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1008 sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
1009 filter_horiz, filter_vert, height,
1010 &diff0[loop_cnt]);
1011 src += 16;
1012 dst += 16;
1013 }
1014
1015 *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1016
1017 return sse;
1018 }
1019
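/* The sub_pixel_avg_* variants additionally average the filtered prediction
 * with 'sec_pred' before computing the SSE and difference sum. */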
1020 static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
1021 int32_t src_stride,
1022 const uint8_t *dst,
1023 int32_t dst_stride,
1024 const uint8_t *sec_pred,
1025 const uint8_t *filter,
1026 int32_t height,
1027 int32_t *diff) {
1028 int16_t filtval;
1029 uint32_t loop_cnt;
1030 uint32_t ref0, ref1, ref2, ref3;
1031 v16u8 out, pred, filt0, ref = { 0 };
1032 v16i8 src0, src1, src2, src3;
1033 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1034 v8u16 vec0, vec1, vec2, vec3;
1035 v8i16 avg = { 0 };
1036 v4i32 vec, var = { 0 };
1037
1038 filtval = LH(filter);
1039 filt0 = (v16u8)__msa_fill_h(filtval);
1040
1041 for (loop_cnt = (height >> 2); loop_cnt--;) {
1042 LD_SB4(src, src_stride, src0, src1, src2, src3);
1043 src += (4 * src_stride);
1044 pred = LD_UB(sec_pred);
1045 sec_pred += 16;
1046 LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1047 dst += (4 * dst_stride);
1048
1049 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1050 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1051 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1052 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1053 vec0, vec1, vec2, vec3);
1054 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
1055 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
1056 src0, src1, src2, src3);
1057 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
1058 out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
1059 out = __msa_aver_u_b(out, pred);
1060 CALC_MSE_AVG_B(out, ref, var, avg);
1061 }
1062
1063 vec = __msa_hadd_s_w(avg, avg);
1064 *diff = HADD_SW_S32(vec);
1065
1066 return HADD_SW_S32(var);
1067 }
1068
1069 static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
1070 int32_t src_stride,
1071 const uint8_t *dst,
1072 int32_t dst_stride,
1073 const uint8_t *sec_pred,
1074 const uint8_t *filter,
1075 int32_t height,
1076 int32_t *diff) {
1077 int16_t filtval;
1078 uint32_t loop_cnt;
1079 v16u8 out, pred, filt0;
1080 v16u8 ref0, ref1, ref2, ref3;
1081 v16i8 src0, src1, src2, src3;
1082 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1083 v8u16 vec0, vec1, vec2, vec3;
1084 v8i16 avg = { 0 };
1085 v4i32 vec, var = { 0 };
1086
1087 filtval = LH(filter);
1088 filt0 = (v16u8)__msa_fill_h(filtval);
1089
1090 for (loop_cnt = (height >> 2); loop_cnt--;) {
1091 LD_SB4(src, src_stride, src0, src1, src2, src3);
1092 src += (4 * src_stride);
1093 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1094 dst += (4 * dst_stride);
1095
1096 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1097 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1098 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1099 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1100 vec0, vec1, vec2, vec3);
1101 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
1102 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
1103 src0, src1, src2, src3);
1104 out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
1105
1106 pred = LD_UB(sec_pred);
1107 sec_pred += 16;
1108 out = __msa_aver_u_b(out, pred);
1109 CALC_MSE_AVG_B(out, ref0, var, avg);
1110 out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
1111 pred = LD_UB(sec_pred);
1112 sec_pred += 16;
1113 out = __msa_aver_u_b(out, pred);
1114 CALC_MSE_AVG_B(out, ref1, var, avg);
1115 }
1116
1117 vec = __msa_hadd_s_w(avg, avg);
1118 *diff = HADD_SW_S32(vec);
1119
1120 return HADD_SW_S32(var);
1121 }
1122
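/* Shared 16-wide kernel: 'width' is the stride of 'sec_pred' (the full block
 * width), so the same code serves the 16/32/64-wide compound cases. */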
1123 static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
1124 int32_t src_stride,
1125 const uint8_t *dst,
1126 int32_t dst_stride,
1127 const uint8_t *sec_pred,
1128 const uint8_t *filter,
1129 int32_t height,
1130 int32_t *diff,
1131 int32_t width) {
1132 int16_t filtval;
1133 uint32_t loop_cnt;
1134 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1135 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1136 v16u8 dst0, dst1, dst2, dst3;
1137 v16u8 tmp0, tmp1, tmp2, tmp3;
1138 v16u8 pred0, pred1, pred2, pred3, filt0;
1139 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1140 v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
1141 v8i16 avg = { 0 };
1142 v4i32 vec, var = { 0 };
1143
1144 filtval = LH(filter);
1145 filt0 = (v16u8)__msa_fill_h(filtval);
1146
1147 for (loop_cnt = (height >> 2); loop_cnt--;) {
1148 LD_SB4(src, src_stride, src0, src2, src4, src6);
1149 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1150 src += (4 * src_stride);
1151 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1152 dst += (4 * dst_stride);
1153 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1154 sec_pred += (4 * width);
1155
1156 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1157 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1158 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
1159 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
1160 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1161 out0, out1, out2, out3);
1162 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1163 out4, out5, out6, out7);
1164 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
1165 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
1166 PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
1167 tmp0, tmp1, tmp2, tmp3);
1168 AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
1169 tmp0, tmp1, tmp2, tmp3);
1170
1171 CALC_MSE_AVG_B(tmp0, dst0, var, avg);
1172 CALC_MSE_AVG_B(tmp1, dst1, var, avg);
1173 CALC_MSE_AVG_B(tmp2, dst2, var, avg);
1174 CALC_MSE_AVG_B(tmp3, dst3, var, avg);
1175 }
1176
1177 vec = __msa_hadd_s_w(avg, avg);
1178 *diff = HADD_SW_S32(vec);
1179
1180 return HADD_SW_S32(var);
1181 }
1182
1183 static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src,
1184 int32_t src_stride,
1185 const uint8_t *dst,
1186 int32_t dst_stride,
1187 const uint8_t *sec_pred,
1188 const uint8_t *filter,
1189 int32_t height,
1190 int32_t *diff) {
1191 return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
1192 sec_pred, filter, height, diff, 16);
1193 }
1194
1195 static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
1196 int32_t src_stride,
1197 const uint8_t *dst,
1198 int32_t dst_stride,
1199 const uint8_t *sec_pred,
1200 const uint8_t *filter,
1201 int32_t height,
1202 int32_t *diff) {
1203 uint32_t loop_cnt, sse = 0;
1204 int32_t diff0[2];
1205
1206 for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1207 sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
1208 sec_pred, filter, height,
1209 &diff0[loop_cnt], 32);
1210 src += 16;
1211 dst += 16;
1212 sec_pred += 16;
1213 }
1214
1215 *diff = diff0[0] + diff0[1];
1216
1217 return sse;
1218 }
1219
1220 static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
1221 int32_t src_stride,
1222 const uint8_t *dst,
1223 int32_t dst_stride,
1224 const uint8_t *sec_pred,
1225 const uint8_t *filter,
1226 int32_t height,
1227 int32_t *diff) {
1228 uint32_t loop_cnt, sse = 0;
1229 int32_t diff0[4];
1230
1231 for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1232 sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
1233 sec_pred, filter, height,
1234 &diff0[loop_cnt], 64);
1235 src += 16;
1236 dst += 16;
1237 sec_pred += 16;
1238 }
1239
1240 *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1241
1242 return sse;
1243 }
1244
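/* Vertical (and, further below, horizontal+vertical) averaging variants follow
 * the same pattern: filter, average with 'sec_pred', then accumulate. */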
1245 static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
1246 int32_t src_stride,
1247 const uint8_t *dst,
1248 int32_t dst_stride,
1249 const uint8_t *sec_pred,
1250 const uint8_t *filter,
1251 int32_t height,
1252 int32_t *diff) {
1253 int16_t filtval;
1254 uint32_t loop_cnt;
1255 uint32_t ref0, ref1, ref2, ref3;
1256 v16u8 src0, src1, src2, src3, src4;
1257 v16u8 src10_r, src32_r, src21_r, src43_r;
1258 v16u8 out, pred, ref = { 0 };
1259 v16u8 src2110, src4332, filt0;
1260 v8i16 avg = { 0 };
1261 v4i32 vec, var = { 0 };
1262 v8u16 tmp0, tmp1;
1263
1264 filtval = LH(filter);
1265 filt0 = (v16u8)__msa_fill_h(filtval);
1266
1267 src0 = LD_UB(src);
1268 src += src_stride;
1269
1270 for (loop_cnt = (height >> 2); loop_cnt--;) {
1271 LD_UB4(src, src_stride, src1, src2, src3, src4);
1272 src += (4 * src_stride);
1273 pred = LD_UB(sec_pred);
1274 sec_pred += 16;
1275 LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1276 dst += (4 * dst_stride);
1277
1278 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1279 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1280 src10_r, src21_r, src32_r, src43_r);
1281 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1282 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1283 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1284
1285 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1286 out = __msa_aver_u_b(out, pred);
1287 CALC_MSE_AVG_B(out, ref, var, avg);
1288 src0 = src4;
1289 }
1290
1291 vec = __msa_hadd_s_w(avg, avg);
1292 *diff = HADD_SW_S32(vec);
1293
1294 return HADD_SW_S32(var);
1295 }
1296
1297 static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
1298 int32_t src_stride,
1299 const uint8_t *dst,
1300 int32_t dst_stride,
1301 const uint8_t *sec_pred,
1302 const uint8_t *filter,
1303 int32_t height,
1304 int32_t *diff) {
1305 int16_t filtval;
1306 uint32_t loop_cnt;
1307 v16u8 src0, src1, src2, src3, src4;
1308 v16u8 ref0, ref1, ref2, ref3;
1309 v16u8 pred0, pred1, filt0;
1310 v8u16 vec0, vec1, vec2, vec3;
1311 v8u16 tmp0, tmp1, tmp2, tmp3;
1312 v8i16 avg = { 0 };
1313 v4i32 vec, var = { 0 };
1314
1315 filtval = LH(filter);
1316 filt0 = (v16u8)__msa_fill_h(filtval);
1317
1318 src0 = LD_UB(src);
1319 src += src_stride;
1320
1321 for (loop_cnt = (height >> 2); loop_cnt--;) {
1322 LD_UB4(src, src_stride, src1, src2, src3, src4);
1323 src += (4 * src_stride);
1324 LD_UB2(sec_pred, 16, pred0, pred1);
1325 sec_pred += 32;
1326 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1327 dst += (4 * dst_stride);
1328 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1329 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
1330 vec0, vec1, vec2, vec3);
1331 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1332 tmp0, tmp1, tmp2, tmp3);
1333 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
1334 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1335 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
1336 CALC_MSE_AVG_B(src0, ref0, var, avg);
1337 CALC_MSE_AVG_B(src1, ref1, var, avg);
1338
1339 src0 = src4;
1340 }
1341
1342 vec = __msa_hadd_s_w(avg, avg);
1343 *diff = HADD_SW_S32(vec);
1344
1345 return HADD_SW_S32(var);
1346 }
1347
1348 static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
1349 int32_t src_stride,
1350 const uint8_t *dst,
1351 int32_t dst_stride,
1352 const uint8_t *sec_pred,
1353 const uint8_t *filter,
1354 int32_t height,
1355 int32_t *diff,
1356 int32_t width) {
1357 int16_t filtval;
1358 uint32_t loop_cnt;
1359 v16u8 ref0, ref1, ref2, ref3;
1360 v16u8 pred0, pred1, pred2, pred3;
1361 v16u8 src0, src1, src2, src3, src4;
1362 v16u8 out0, out1, out2, out3, filt0;
1363 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1364 v8u16 tmp0, tmp1, tmp2, tmp3;
1365 v8i16 avg = { 0 };
1366 v4i32 vec, var = { 0 };
1367
1368 filtval = LH(filter);
1369 filt0 = (v16u8)__msa_fill_h(filtval);
1370
1371 src0 = LD_UB(src);
1372 src += src_stride;
1373
1374 for (loop_cnt = (height >> 2); loop_cnt--;) {
1375 LD_UB4(src, src_stride, src1, src2, src3, src4);
1376 src += (4 * src_stride);
1377 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1378 sec_pred += (4 * width);
1379
1380 ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
1381 ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
1382 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1383 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1384 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1385
1386 ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
1387 ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
1388 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1389 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1390 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1391
1392 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1393 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1394 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1395
1396 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1397 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1398 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1399
1400 src0 = src4;
1401 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1402 dst += (4 * dst_stride);
1403
1404 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
1405 out0, out1, out2, out3);
1406
1407 CALC_MSE_AVG_B(out0, ref0, var, avg);
1408 CALC_MSE_AVG_B(out1, ref1, var, avg);
1409 CALC_MSE_AVG_B(out2, ref2, var, avg);
1410 CALC_MSE_AVG_B(out3, ref3, var, avg);
1411 }
1412
1413 vec = __msa_hadd_s_w(avg, avg);
1414 *diff = HADD_SW_S32(vec);
1415
1416 return HADD_SW_S32(var);
1417 }
1418
1419 static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src,
1420 int32_t src_stride,
1421 const uint8_t *dst,
1422 int32_t dst_stride,
1423 const uint8_t *sec_pred,
1424 const uint8_t *filter,
1425 int32_t height,
1426 int32_t *diff) {
1427 return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
1428 sec_pred, filter, height, diff, 16);
1429 }
1430
1431 static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
1432 int32_t src_stride,
1433 const uint8_t *dst,
1434 int32_t dst_stride,
1435 const uint8_t *sec_pred,
1436 const uint8_t *filter,
1437 int32_t height,
1438 int32_t *diff) {
1439 uint32_t loop_cnt, sse = 0;
1440 int32_t diff0[2];
1441
1442 for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1443 sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
1444 sec_pred, filter, height,
1445 &diff0[loop_cnt], 32);
1446 src += 16;
1447 dst += 16;
1448 sec_pred += 16;
1449 }
1450
1451 *diff = diff0[0] + diff0[1];
1452
1453 return sse;
1454 }
1455
1456 static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
1457 int32_t src_stride,
1458 const uint8_t *dst,
1459 int32_t dst_stride,
1460 const uint8_t *sec_pred,
1461 const uint8_t *filter,
1462 int32_t height,
1463 int32_t *diff) {
1464 uint32_t loop_cnt, sse = 0;
1465 int32_t diff0[4];
1466
1467 for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1468 sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
1469 sec_pred, filter, height,
1470 &diff0[loop_cnt], 64);
1471 src += 16;
1472 dst += 16;
1473 sec_pred += 16;
1474 }
1475
1476 *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1477
1478 return sse;
1479 }
1480
1481 static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
1482 const uint8_t *src, int32_t src_stride,
1483 const uint8_t *dst, int32_t dst_stride,
1484 const uint8_t *sec_pred,
1485 const uint8_t *filter_horiz, const uint8_t *filter_vert,
1486 int32_t height, int32_t *diff) {
1487 int16_t filtval;
1488 uint32_t loop_cnt;
1489 uint32_t ref0, ref1, ref2, ref3;
1490 v16u8 src0, src1, src2, src3, src4;
1491 v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1492 v16u8 filt_hz, filt_vt, vec0, vec1;
1493 v16u8 out, pred, ref = { 0 };
1494 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
1495 v8i16 avg = { 0 };
1496 v4i32 vec, var = { 0 };
1497
1498 filtval = LH(filter_horiz);
1499 filt_hz = (v16u8)__msa_fill_h(filtval);
1500 filtval = LH(filter_vert);
1501 filt_vt = (v16u8)__msa_fill_h(filtval);
1502
1503 src0 = LD_UB(src);
1504 src += src_stride;
1505
1506 for (loop_cnt = (height >> 2); loop_cnt--;) {
1507 LD_UB4(src, src_stride, src1, src2, src3, src4);
1508 src += (4 * src_stride);
1509 pred = LD_UB(sec_pred);
1510 sec_pred += 16;
1511 LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1512 dst += (4 * dst_stride);
1513 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1514 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
1515 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
1516 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1517 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
1518 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
1519 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1520 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1521 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1522 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1523 out = __msa_aver_u_b(out, pred);
1524 CALC_MSE_AVG_B(out, ref, var, avg);
1525 src0 = src4;
1526 }
1527
1528 vec = __msa_hadd_s_w(avg, avg);
1529 *diff = HADD_SW_S32(vec);
1530
1531 return HADD_SW_S32(var);
1532 }
1533
1534 static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
1535 const uint8_t *src, int32_t src_stride,
1536 const uint8_t *dst, int32_t dst_stride,
1537 const uint8_t *sec_pred,
1538 const uint8_t *filter_horiz, const uint8_t *filter_vert,
1539 int32_t height, int32_t *diff) {
1540 int16_t filtval;
1541 uint32_t loop_cnt;
1542 v16u8 ref0, ref1, ref2, ref3;
1543 v16u8 src0, src1, src2, src3, src4;
1544 v16u8 pred0, pred1, out0, out1;
1545 v16u8 filt_hz, filt_vt, vec0;
1546 v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1547 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
1548 v8i16 avg = { 0 };
1549 v4i32 vec, var = { 0 };
1550
1551 filtval = LH(filter_horiz);
1552 filt_hz = (v16u8)__msa_fill_h(filtval);
1553 filtval = LH(filter_vert);
1554 filt_vt = (v16u8)__msa_fill_h(filtval);
1555
1556 src0 = LD_UB(src);
1557 src += src_stride;
1558 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1559
1560 for (loop_cnt = (height >> 2); loop_cnt--;) {
1561 LD_UB4(src, src_stride, src1, src2, src3, src4);
1562 src += (4 * src_stride);
1563 LD_UB2(sec_pred, 16, pred0, pred1);
1564 sec_pred += 32;
1565 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1566 dst += (4 * dst_stride);
1567
1568 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1569 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1570
1571 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
1572 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
1573 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
1574
1575 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
1576 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
1577 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1578 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
1579
1580 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
1581 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
1582 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1583
1584 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
1585 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
1586
1587 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1588 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1589 AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
1590
1591 CALC_MSE_AVG_B(out0, ref0, var, avg);
1592 CALC_MSE_AVG_B(out1, ref1, var, avg);
1593 }
1594
1595 vec = __msa_hadd_s_w(avg, avg);
1596 *diff = HADD_SW_S32(vec);
1597
1598 return HADD_SW_S32(var);
1599 }
1600
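/* Shared helper for 16-pixel-wide columns of h+v filtered, sec_pred-averaged
 * blocks. 'width' is the row stride of the sec_pred buffer, so the same code
 * serves the 16-, 32- and 64-wide wrappers below. Returns the SSE and writes
 * the signed sum of differences to *diff. */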
1601 static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
1602 int32_t src_stride,
1603 const uint8_t *dst,
1604 int32_t dst_stride,
1605 const uint8_t *sec_pred,
1606 const uint8_t *filter_horiz,
1607 const uint8_t *filter_vert,
1608 int32_t height,
1609 int32_t *diff,
1610 int32_t width) {
1611 int16_t filtval;
1612 uint32_t loop_cnt;
1613 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1614 v16u8 ref0, ref1, ref2, ref3;
1615 v16u8 pred0, pred1, pred2, pred3;
1616 v16u8 out0, out1, out2, out3;
1617 v16u8 filt_hz, filt_vt, vec0, vec1;
1618 v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1619 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
1620 v8i16 avg = { 0 };
1621 v4i32 vec, var = { 0 };
1622
1623 filtval = LH(filter_horiz);
1624 filt_hz = (v16u8)__msa_fill_h(filtval);
1625 filtval = LH(filter_vert);
1626 filt_vt = (v16u8)__msa_fill_h(filtval);
1627
1628 LD_UB2(src, 8, src0, src1);
1629 src += src_stride;
1630
1631 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1632 hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1633
1634 for (loop_cnt = (height >> 2); loop_cnt--;) {
1635 LD_UB4(src, src_stride, src0, src2, src4, src6);
1636 LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
1637 src += (4 * src_stride);
1638 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1639 sec_pred += (4 * width);
1640
1641 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1642 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1643 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1644 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1645 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1646 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1647
1648 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
1649 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
1650 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
1651 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1652 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1653 out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1654
1655 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1656 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
1657 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1658 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1659 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1660 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1661
1662 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
1663 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
1664 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
1665 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1666 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1667 out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1668
1669 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1670 dst += (4 * dst_stride);
1671
1672 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
1673 out0, out1, out2, out3);
1674
1675 CALC_MSE_AVG_B(out0, ref0, var, avg);
1676 CALC_MSE_AVG_B(out1, ref1, var, avg);
1677 CALC_MSE_AVG_B(out2, ref2, var, avg);
1678 CALC_MSE_AVG_B(out3, ref3, var, avg);
1679 }
1680
1681 vec = __msa_hadd_s_w(avg, avg);
1682 *diff = HADD_SW_S32(vec);
1683
1684 return HADD_SW_S32(var);
1685 }
1686
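/* 16-wide blocks map directly onto the shared helper with a sec_pred row
 * stride of 16. */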
1687 static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
1688 const uint8_t *src, int32_t src_stride,
1689 const uint8_t *dst, int32_t dst_stride,
1690 const uint8_t *sec_pred,
1691 const uint8_t *filter_horiz, const uint8_t *filter_vert,
1692 int32_t height, int32_t *diff) {
1693 return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
1694 sec_pred, filter_horiz, filter_vert,
1695 height, diff, 16);
1696 }
1697
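/* 32-wide blocks are processed as two 16-pixel-wide columns; src, dst and
 * sec_pred advance by 16 per column while the sec_pred row stride stays 32. */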
1698 static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
1699 const uint8_t *src, int32_t src_stride,
1700 const uint8_t *dst, int32_t dst_stride,
1701 const uint8_t *sec_pred,
1702 const uint8_t *filter_horiz, const uint8_t *filter_vert,
1703 int32_t height, int32_t *diff) {
1704 uint32_t loop_cnt, sse = 0;
1705 int32_t diff0[2];
1706
1707 for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1708 sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
1709 sec_pred, filter_horiz, filter_vert,
1710 height, &diff0[loop_cnt], 32);
1711 src += 16;
1712 dst += 16;
1713 sec_pred += 16;
1714 }
1715
1716 *diff = diff0[0] + diff0[1];
1717
1718 return sse;
1719 }
1720
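/* 64-wide blocks are processed as four 16-pixel-wide columns with a sec_pred
 * row stride of 64. */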
1721 static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
1722 const uint8_t *src, int32_t src_stride,
1723 const uint8_t *dst, int32_t dst_stride,
1724 const uint8_t *sec_pred,
1725 const uint8_t *filter_horiz, const uint8_t *filter_vert,
1726 int32_t height, int32_t *diff) {
1727 uint32_t loop_cnt, sse = 0;
1728 int32_t diff0[4];
1729
1730 for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1731 sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
1732 sec_pred, filter_horiz, filter_vert,
1733 height, &diff0[loop_cnt], 64);
1734 src += 16;
1735 dst += 16;
1736 sec_pred += 16;
1737 }
1738
1739 *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1740
1741 return sse;
1742 }
1743
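/* variance = sse - (sum * sum) / (width * height); the shift below is
 * log2(width * height), e.g. 16x16 -> 256 pixels -> shift 8. Blocks of 512 or
 * more pixels use the LARGE variant, which squares the sum in 64 bits since
 * (512 * 255)^2 no longer fits in 32 bits. */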
1744 #define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
1745 #define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
1746 #define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
1747 #define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
1748 #define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
1749 #define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
1750 #define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
1751
1752 #define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
1753 #define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
1754 #define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
1755 #define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
1756 #define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
1757 #define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
1758
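/* Generates vpx_sub_pixel_variance<wd>x<ht>_msa(). xoffset/yoffset are
 * sub-pel offsets in the range 0..7 indexing bilinear_filters_msa; the macro
 * dispatches to the horizontal-only, vertical-only, combined h+v or
 * integer-pel path, stores the SSE through *sse and returns the variance. For
 * example, calling the 8x8 variant with xoffset = 2 and yoffset = 0 applies
 * only the horizontal filter. */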
1759 #define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
1760 uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
1761 int32_t src_stride, \
1762 int32_t xoffset, \
1763 int32_t yoffset, \
1764 const uint8_t *ref, \
1765 int32_t ref_stride, \
1766 uint32_t *sse) { \
1767 int32_t diff; \
1768 uint32_t var; \
1769 const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
1770 const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
1771 \
1772 if (yoffset) { \
1773 if (xoffset) { \
1774 *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \
1775 ref, ref_stride, \
1776 h_filter, v_filter, \
1777 ht, &diff); \
1778 } else { \
1779 *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \
1780 ref, ref_stride, \
1781 v_filter, ht, &diff); \
1782 } \
1783 \
1784 var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
1785 } else { \
1786 if (xoffset) { \
1787 *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \
1788 ref, ref_stride, \
1789 h_filter, ht, &diff); \
1790 \
1791 var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
1792 } else { \
1793 var = vpx_variance##wd##x##ht##_msa(src, src_stride, \
1794 ref, ref_stride, sse); \
1795 } \
1796 } \
1797 \
1798 return var; \
1799 }
1800
1801 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
1802 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
1803
1804 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
1805 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
1806 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
1807
1808 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
1809 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
1810 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
1811
1812 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
1813 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
1814 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
1815
1816 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
1817 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
1818
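/* Same dispatch as the macro above, but the filtered prediction is first
 * averaged with sec_pred (the second predictor of a compound prediction)
 * before the SSE/sum computation. Illustrative call for a 16x16 block with
 * half-pel offsets in both directions (xoffset = yoffset = 4):
 *   var = vpx_sub_pixel_avg_variance16x16_msa(src, src_stride, 4, 4,
 *                                             ref, ref_stride, &sse,
 *                                             second_pred);
 */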
1819 #define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
1820 uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \
1821 const uint8_t *src_ptr, int32_t src_stride, \
1822 int32_t xoffset, int32_t yoffset, \
1823 const uint8_t *ref_ptr, int32_t ref_stride, \
1824 uint32_t *sse, const uint8_t *sec_pred) { \
1825 int32_t diff; \
1826 const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
1827 const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
1828 \
1829 if (yoffset) { \
1830 if (xoffset) { \
1831 *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride, \
1832 ref_ptr, ref_stride, \
1833 sec_pred, h_filter, \
1834 v_filter, ht, &diff); \
1835 } else { \
1836 *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride, \
1837 ref_ptr, ref_stride, \
1838 sec_pred, v_filter, \
1839 ht, &diff); \
1840 } \
1841 } else { \
1842 if (xoffset) { \
1843 *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride, \
1844 ref_ptr, ref_stride, \
1845 sec_pred, h_filter, \
1846 ht, &diff); \
1847 } else { \
1848 *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, \
1849 ref_ptr, ref_stride, \
1850 sec_pred, ht, &diff); \
1851 } \
1852 } \
1853 \
1854 return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
1855 }
1856
1857 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
1858 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
1859
1860 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
1861 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
1862 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
1863
1864 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
1865 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
1866 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
1867
1868 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
1869 VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
1870
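/* 32x64 is written out by hand rather than generated by the macro above
 * because its integer-pel path uses the dedicated avg_sse_diff_32x64_msa()
 * helper instead of the height-parameterized 32-width routine. */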
1871 uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
1872 int32_t src_stride,
1873 int32_t xoffset,
1874 int32_t yoffset,
1875 const uint8_t *ref_ptr,
1876 int32_t ref_stride,
1877 uint32_t *sse,
1878 const uint8_t *sec_pred) {
1879 int32_t diff;
1880 const uint8_t *h_filter = bilinear_filters_msa[xoffset];
1881 const uint8_t *v_filter = bilinear_filters_msa[yoffset];
1882
1883 if (yoffset) {
1884 if (xoffset) {
1885 *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
1886 ref_ptr, ref_stride,
1887 sec_pred, h_filter,
1888 v_filter, 64, &diff);
1889 } else {
1890 *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
1891 ref_ptr, ref_stride,
1892 sec_pred, v_filter,
1893 64, &diff);
1894 }
1895 } else {
1896 if (xoffset) {
1897 *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
1898 ref_ptr, ref_stride,
1899 sec_pred, h_filter,
1900 64, &diff);
1901 } else {
1902 *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
1903 sec_pred, &diff);
1904 }
1905 }
1906
1907 return VARIANCE_32Wx64H(*sse, diff);
1908 }
1909
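/* Generates the 64x32 and 64x64 averaging variants; the integer-pel path
 * likewise uses the size-specific avg_sse_diff_64x<ht>_msa() helpers. */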
1910 #define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
1911 uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr, \
1912 int32_t src_stride, \
1913 int32_t xoffset, \
1914 int32_t yoffset, \
1915 const uint8_t *ref_ptr, \
1916 int32_t ref_stride, \
1917 uint32_t *sse, \
1918 const uint8_t *sec_pred) { \
1919 int32_t diff; \
1920 const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
1921 const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
1922 \
1923 if (yoffset) { \
1924 if (xoffset) { \
1925 *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride, \
1926 ref_ptr, ref_stride, \
1927 sec_pred, h_filter, \
1928 v_filter, ht, &diff); \
1929 } else { \
1930 *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride, \
1931 ref_ptr, ref_stride, \
1932 sec_pred, v_filter, \
1933 ht, &diff); \
1934 } \
1935 } else { \
1936 if (xoffset) { \
1937 *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride, \
1938 ref_ptr, ref_stride, \
1939 sec_pred, h_filter, \
1940 ht, &diff); \
1941 } else { \
1942 *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, \
1943 ref_ptr, ref_stride, \
1944 sec_pred, &diff); \
1945 } \
1946 } \
1947 \
1948 return VARIANCE_64Wx##ht##H(*sse, diff); \
1949 }
1950
1951 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
1952 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
1953