/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>
#include "./vp8_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"
#include "vp8/encoder/denoising.h"
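
/* MSA (MIPS SIMD Architecture) versions of the VP8 temporal denoiser:
 * vp8_denoiser_filter_msa() filters one 16x16 luma block and
 * vp8_denoiser_filter_uv_msa() one 8x8 chroma block, mirroring the C
 * reference implementations in vp8/encoder/denoising.c.
 */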

int32_t vp8_denoiser_filter_msa(uint8_t *mc_running_avg_y_ptr,
                                int32_t mc_avg_y_stride,
                                uint8_t *running_avg_y_ptr,
                                int32_t avg_y_stride, uint8_t *sig_ptr,
                                int32_t sig_stride, uint32_t motion_magnitude,
                                int32_t increase_denoising) {
  uint8_t *running_avg_y_start = running_avg_y_ptr;
  uint8_t *sig_start = sig_ptr;
  int32_t cnt = 0;
  int32_t sum_diff = 0;
  int32_t shift_inc1 = 3;
  int32_t delta = 0;
  int32_t sum_diff_thresh;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
  v16u8 mc_running_avg_y0, running_avg_y, sig0;
  v16u8 mc_running_avg_y1, running_avg_y1, sig1;
  v16u8 coeff0, coeff1;
  v8i16 diff0, diff1, abs_diff0, abs_diff1, abs_diff_neg0, abs_diff_neg1;
  v8i16 adjust0, adjust1, adjust2, adjust3;
  v8i16 shift_inc1_vec = { 0 };
  v8i16 col_sum0 = { 0 };
  v8i16 col_sum1 = { 0 };
  v8i16 col_sum2 = { 0 };
  v8i16 col_sum3 = { 0 };
  v8i16 temp0_h, temp1_h, temp2_h, temp3_h, cmp, delta_vec;
  v4i32 temp0_w;
  v2i64 temp0_d, temp1_d;
  v8i16 zero = { 0 };
  v8i16 one = __msa_ldi_h(1);
  v8i16 four = __msa_ldi_h(4);
  v8i16 val_127 = __msa_ldi_h(127);
  v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };

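  /* Build the adjustment table: when motion is low the step sizes are
   * increased by one (and once more for increase_denoising), and the upper
   * four entries hold the negated steps so adding 4 to the lookup index
   * selects the opposite sign. */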
  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
    adj_val = __msa_add_a_h(adj_val, one);
    if (increase_denoising) {
      adj_val = __msa_add_a_h(adj_val, one);
      shift_inc1 = 4;
    }

    temp0_h = zero - adj_val;
    adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
  }

  adj_val = __msa_insert_h(adj_val, 3, cnt);
  adj_val = __msa_insert_h(adj_val, 7, cnt);
  shift_inc1_vec = __msa_fill_h(shift_inc1);

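  /* First pass: filter the 16x16 luma block two rows per iteration. Each
   * pixel moves toward the motion-compensated average by a step that grows
   * with the pixel difference; pixels that differ only slightly take the
   * motion-compensated value directly. */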
  for (cnt = 8; cnt--;) {
    v8i16 mask0 = { 0 };
    v8i16 mask1 = { 0 };

    mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
    sig0 = LD_UB(sig_ptr);
    sig_ptr += sig_stride;
    mc_running_avg_y_ptr += mc_avg_y_stride;

    mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
    sig1 = LD_UB(sig_ptr);

    ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
    HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
    abs_diff0 = __msa_add_a_h(diff0, zero);
    abs_diff1 = __msa_add_a_h(diff1, zero);
    cmp = __msa_clei_s_h(abs_diff0, 15);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff0, 7);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = abs_diff0 < shift_inc1_vec;
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff1, 15);
    cmp = cmp & one;
    mask1 += cmp;
    cmp = __msa_clei_s_h(abs_diff1, 7);
    cmp = cmp & one;
    mask1 += cmp;
    cmp = abs_diff1 < shift_inc1_vec;
    cmp = cmp & one;
    mask1 += cmp;
    temp0_h = __msa_clei_s_h(diff0, 0);
    temp0_h = temp0_h & four;
    mask0 += temp0_h;
    temp1_h = __msa_clei_s_h(diff1, 0);
    temp1_h = temp1_h & four;
    mask1 += temp1_h;
    VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
               adjust1);
    temp2_h = __msa_ceqi_h(adjust0, 0);
    temp3_h = __msa_ceqi_h(adjust1, 0);
    adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
    adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, (v16u8)temp3_h);
    ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
    UNPCK_UB_SH(sig0, temp0_h, temp1_h);
    ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
    MAXI_SH2_SH(temp0_h, temp1_h, 0);
    SAT_UH2_SH(temp0_h, temp1_h, 7);
    temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
    running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
    running_avg_y =
        __msa_bmnz_v(running_avg_y, mc_running_avg_y0, (v16u8)temp2_h);
    ST_UB(running_avg_y, running_avg_y_ptr);
    running_avg_y_ptr += avg_y_stride;

    mask0 = zero;
    mask1 = zero;
    ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
    HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
    abs_diff0 = __msa_add_a_h(diff0, zero);
    abs_diff1 = __msa_add_a_h(diff1, zero);
    cmp = __msa_clei_s_h(abs_diff0, 15);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff0, 7);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = abs_diff0 < shift_inc1_vec;
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff1, 15);
    cmp = cmp & one;
    mask1 += cmp;
    cmp = __msa_clei_s_h(abs_diff1, 7);
    cmp = cmp & one;
    mask1 += cmp;
    cmp = abs_diff1 < shift_inc1_vec;
    cmp = cmp & one;
    mask1 += cmp;
    temp0_h = __msa_clei_s_h(diff0, 0);
    temp0_h = temp0_h & four;
    mask0 += temp0_h;
    temp1_h = __msa_clei_s_h(diff1, 0);
    temp1_h = temp1_h & four;
    mask1 += temp1_h;
    VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
               adjust1);
    temp2_h = __msa_ceqi_h(adjust0, 0);
    temp3_h = __msa_ceqi_h(adjust1, 0);
    adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
    adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, (v16u8)temp3_h);
    ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
    UNPCK_UB_SH(sig1, temp0_h, temp1_h);
    ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
    MAXI_SH2_SH(temp0_h, temp1_h, 0);
    SAT_UH2_SH(temp0_h, temp1_h, 7);
    temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
    running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
    running_avg_y =
        __msa_bmnz_v(running_avg_y, mc_running_avg_y1, (v16u8)temp2_h);
    ST_UB(running_avg_y, running_avg_y_ptr);
    sig_ptr += sig_stride;
    mc_running_avg_y_ptr += mc_avg_y_stride;
    running_avg_y_ptr += avg_y_stride;
  }

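  /* Cap the per-column sums at 127 and reduce them to a single sum_diff. */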
  col_sum0 = __msa_min_s_h(col_sum0, val_127);
  col_sum1 = __msa_min_s_h(col_sum1, val_127);
  temp0_h = col_sum0 + col_sum1;
  temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
  temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
  temp1_d = __msa_splati_d(temp0_d, 1);
  temp0_d += temp1_d;
  sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
  sig_ptr -= sig_stride * 16;
  mc_running_avg_y_ptr -= mc_avg_y_stride * 16;
  running_avg_y_ptr -= avg_y_stride * 16;

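  /* Select the total-difference threshold; the filtered block is only
   * accepted when the accumulated difference stays below it. */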
  sum_diff_thresh = SUM_DIFF_THRESHOLD;
  if (increase_denoising) {
    sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
  }

  if (abs(sum_diff) > sum_diff_thresh) {
    delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
    delta_vec = __msa_fill_h(delta);
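    /* For a small overshoot (delta < 4), run a corrective pass that pulls
     * the filtered pixels back toward the source by at most delta instead
     * of abandoning the filtered block. */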
    if (delta < 4) {
      for (cnt = 8; cnt--;) {
        running_avg_y = LD_UB(running_avg_y_ptr);
        mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
        sig0 = LD_UB(sig_ptr);
        sig_ptr += sig_stride;
        mc_running_avg_y_ptr += mc_avg_y_stride;
        running_avg_y_ptr += avg_y_stride;
        mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
        sig1 = LD_UB(sig_ptr);
        running_avg_y1 = LD_UB(running_avg_y_ptr);
        ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
        HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
        abs_diff0 = __msa_add_a_h(diff0, zero);
        abs_diff1 = __msa_add_a_h(diff1, zero);
        temp0_h = abs_diff0 < delta_vec;
        temp1_h = abs_diff1 < delta_vec;
        abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)delta_vec,
                                       (v16u8)temp0_h);
        abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)delta_vec,
                                       (v16u8)temp1_h);
        SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, abs_diff_neg1);
        temp0_h = __msa_clei_s_h(diff0, 0);
        temp1_h = __msa_clei_s_h(diff1, 0);
        adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
                                     (v16u8)temp0_h);
        adjust1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)abs_diff_neg1,
                                     (v16u8)temp1_h);
        ILVRL_B2_SH(zero, running_avg_y, temp2_h, temp3_h);
        ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
        MAXI_SH2_SH(adjust2, adjust3, 0);
        SAT_UH2_SH(adjust2, adjust3, 7);
        temp0_h = __msa_ceqi_h(diff0, 0);
        temp1_h = __msa_ceqi_h(diff1, 0);
        adjust2 =
            (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
        adjust3 =
            (v8i16)__msa_bmnz_v((v16u8)adjust3, (v16u8)temp3_h, (v16u8)temp1_h);
        adjust0 =
            (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
        adjust1 =
            (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)zero, (v16u8)temp1_h);
        ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
        running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, (v16i8)adjust2);
        ST_UB(running_avg_y, running_avg_y_ptr - avg_y_stride);
        ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
        HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
        abs_diff0 = __msa_add_a_h(diff0, zero);
        abs_diff1 = __msa_add_a_h(diff1, zero);
        temp0_h = abs_diff0 < delta_vec;
        temp1_h = abs_diff1 < delta_vec;
        abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)delta_vec,
                                       (v16u8)temp0_h);
        abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)delta_vec,
                                       (v16u8)temp1_h);
        SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, abs_diff_neg1);
        temp0_h = __msa_clei_s_h(diff0, 0);
        temp1_h = __msa_clei_s_h(diff1, 0);
        adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
                                     (v16u8)temp0_h);
        adjust1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)abs_diff_neg1,
                                     (v16u8)temp1_h);
        ILVRL_B2_SH(zero, running_avg_y1, temp2_h, temp3_h);
        ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
        MAXI_SH2_SH(adjust2, adjust3, 0);
        SAT_UH2_SH(adjust2, adjust3, 7);
        temp0_h = __msa_ceqi_h(diff0, 0);
        temp1_h = __msa_ceqi_h(diff1, 0);
        adjust2 =
            (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
        adjust3 =
            (v8i16)__msa_bmnz_v((v16u8)adjust3, (v16u8)temp3_h, (v16u8)temp1_h);
        adjust0 =
            (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
        adjust1 =
            (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)zero, (v16u8)temp1_h);
        ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
        running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, (v16i8)adjust2);
        ST_UB(running_avg_y, running_avg_y_ptr);
        running_avg_y_ptr += avg_y_stride;
      }

      col_sum2 = __msa_min_s_h(col_sum2, val_127);
      col_sum3 = __msa_min_s_h(col_sum3, val_127);
      temp0_h = col_sum2 + col_sum3;
      temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
      temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
      temp1_d = __msa_splati_d(temp0_d, 1);
      temp0_d += temp1_d;
      sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
      if (abs(sum_diff) > sum_diff_thresh) {
        return COPY_BLOCK;
      }
    } else {
      return COPY_BLOCK;
    }
  }

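  /* Filtering succeeded: copy the denoised running average back over the
   * source block, as the C reference does with vp8_copy_mem16x16(). */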
  LD_UB8(running_avg_y_start, avg_y_stride, src0, src1, src2, src3, src4, src5,
         src6, src7);
  running_avg_y_start += (8 * avg_y_stride);
  LD_UB8(running_avg_y_start, avg_y_stride, src8, src9, src10, src11, src12,
         src13, src14, src15);

  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, sig_start, sig_stride);
  sig_start += (8 * sig_stride);
  ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, sig_start,
         sig_stride);

  return FILTER_BLOCK;
}

int32_t vp8_denoiser_filter_uv_msa(
    uint8_t *mc_running_avg_y_ptr, int32_t mc_avg_y_stride,
    uint8_t *running_avg_y_ptr, int32_t avg_y_stride, uint8_t *sig_ptr,
    int32_t sig_stride, uint32_t motion_magnitude, int32_t increase_denoising) {
  uint8_t *running_avg_y_start = running_avg_y_ptr;
  uint8_t *sig_start = sig_ptr;
  int32_t cnt = 0;
  int32_t sum_diff = 0;
  int32_t shift_inc1 = 3;
  int32_t delta = 0;
  int32_t sum_block = 0;
  int32_t sum_diff_thresh;
  int64_t dst0, dst1, src0, src1, src2, src3;
  v16u8 mc_running_avg_y0, running_avg_y, sig0;
  v16u8 mc_running_avg_y1, running_avg_y1, sig1;
  v16u8 sig2, sig3, sig4, sig5, sig6, sig7;
  v16u8 coeff0;
  v8i16 diff0, abs_diff0, abs_diff_neg0;
  v8i16 adjust0, adjust2;
  v8i16 shift_inc1_vec = { 0 };
  v8i16 col_sum0 = { 0 };
  v8i16 temp0_h, temp2_h, cmp, delta_vec;
  v4i32 temp0_w;
  v2i64 temp0_d, temp1_d;
  v16i8 zero = { 0 };
  v8i16 one = __msa_ldi_h(1);
  v8i16 four = __msa_ldi_h(4);
  v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };

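  /* Skip denoising if the chroma block's mean is close to mid-level (128),
   * as the C reference does for near-average color blocks. */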
  sig0 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
  sig1 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
  sig2 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig2);
  sig3 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig3);
  sig4 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig4);
  sig5 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig5);
  sig6 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig6);
  sig7 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig7);
  temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
  temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
  temp1_d = __msa_splati_d(temp0_d, 1);
  temp0_d += temp1_d;
  sum_block = __msa_copy_s_w((v4i32)temp0_d, 0);
  sig_ptr -= sig_stride * 8;

  if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
    return COPY_BLOCK;
  }

  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
    adj_val = __msa_add_a_h(adj_val, one);

    if (increase_denoising) {
      adj_val = __msa_add_a_h(adj_val, one);
      shift_inc1 = 4;
    }

    temp0_h = (v8i16)zero - adj_val;
    adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
  }

  adj_val = __msa_insert_h(adj_val, 3, cnt);
  adj_val = __msa_insert_h(adj_val, 7, cnt);
  shift_inc1_vec = __msa_fill_h(shift_inc1);
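  /* First pass: filter the 8x8 chroma block two rows per iteration, using
   * the same threshold and step scheme as the luma filter. */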
  for (cnt = 4; cnt--;) {
    v8i16 mask0 = { 0 };
    mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
    sig0 = LD_UB(sig_ptr);
    sig_ptr += sig_stride;
    mc_running_avg_y_ptr += mc_avg_y_stride;
    mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
    sig1 = LD_UB(sig_ptr);
    coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0);
    diff0 = __msa_hsub_u_h(coeff0, coeff0);
    abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
    cmp = __msa_clei_s_h(abs_diff0, 15);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff0, 7);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = abs_diff0 < shift_inc1_vec;
    cmp = cmp & one;
    mask0 += cmp;
    temp0_h = __msa_clei_s_h(diff0, 0);
    temp0_h = temp0_h & four;
    mask0 += temp0_h;
    adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
    temp2_h = __msa_ceqi_h(adjust0, 0);
    adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
    col_sum0 += adjust0;
    temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
    temp0_h += adjust0;
    temp0_h = __msa_maxi_s_h(temp0_h, 0);
    temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);
    temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
    running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
    running_avg_y =
        __msa_bmnz_v(running_avg_y, mc_running_avg_y0, (v16u8)temp2_h);
    dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0);
    SD(dst0, running_avg_y_ptr);
    running_avg_y_ptr += avg_y_stride;

    mask0 = __msa_ldi_h(0);
    coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1);
    diff0 = __msa_hsub_u_h(coeff0, coeff0);
    abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
    cmp = __msa_clei_s_h(abs_diff0, 15);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff0, 7);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = abs_diff0 < shift_inc1_vec;
    cmp = cmp & one;
    mask0 += cmp;
    temp0_h = __msa_clei_s_h(diff0, 0);
    temp0_h = temp0_h & four;
    mask0 += temp0_h;
    adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
    temp2_h = __msa_ceqi_h(adjust0, 0);
    adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
    col_sum0 += adjust0;
    temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
    temp0_h += adjust0;
    temp0_h = __msa_maxi_s_h(temp0_h, 0);
    temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);

    temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
    running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
    running_avg_y =
        __msa_bmnz_v(running_avg_y, mc_running_avg_y1, (v16u8)temp2_h);
    dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
    SD(dst1, running_avg_y_ptr);

    sig_ptr += sig_stride;
    mc_running_avg_y_ptr += mc_avg_y_stride;
    running_avg_y_ptr += avg_y_stride;
  }

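  /* Reduce the column sums to a single sum_diff. */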
  temp0_h = col_sum0;
  temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
  temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
  temp1_d = __msa_splati_d(temp0_d, 1);
  temp0_d += temp1_d;
  sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
  sig_ptr -= sig_stride * 8;
  mc_running_avg_y_ptr -= mc_avg_y_stride * 8;
  running_avg_y_ptr -= avg_y_stride * 8;
  sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;

  if (increase_denoising) {
    sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
  }

  if (abs(sum_diff) > sum_diff_thresh) {
    delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
    delta_vec = __msa_fill_h(delta);
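    /* As in the luma path, a small overshoot (delta < 4) triggers a
     * corrective pass instead of rejecting the filtered block. */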
    if (delta < 4) {
      for (cnt = 4; cnt--;) {
        running_avg_y = LD_UB(running_avg_y_ptr);
        mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
        sig0 = LD_UB(sig_ptr);
        /* Update pointers for next iteration. */
        sig_ptr += sig_stride;
        mc_running_avg_y_ptr += mc_avg_y_stride;
        running_avg_y_ptr += avg_y_stride;

        mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
        sig1 = LD_UB(sig_ptr);
        running_avg_y1 = LD_UB(running_avg_y_ptr);

        coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0);
        diff0 = __msa_hsub_u_h(coeff0, coeff0);
        abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
        temp0_h = delta_vec < abs_diff0;
        abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)delta_vec,
                                        (v16u8)temp0_h);
        abs_diff_neg0 = (v8i16)zero - abs_diff0;
        temp0_h = __msa_clei_s_h(diff0, 0);
        adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
                                     (v16u8)temp0_h);
        temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y);
        adjust2 = temp2_h + adjust0;
        adjust2 = __msa_maxi_s_h(adjust2, 0);
        adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
        temp0_h = __msa_ceqi_h(diff0, 0);
        adjust2 =
            (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
        adjust0 =
            (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
        col_sum0 += adjust0;
        running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, (v16i8)adjust2);
        dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0);
        SD(dst0, running_avg_y_ptr - avg_y_stride);

        coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1);
        diff0 = __msa_hsub_u_h(coeff0, coeff0);
        abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
        temp0_h = delta_vec < abs_diff0;
        abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)delta_vec,
                                        (v16u8)temp0_h);
        abs_diff_neg0 = (v8i16)zero - abs_diff0;
        temp0_h = __msa_clei_s_h(diff0, 0);
        adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
                                     (v16u8)temp0_h);
        temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y1);
        adjust2 = temp2_h + adjust0;
        adjust2 = __msa_maxi_s_h(adjust2, 0);
        adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
        temp0_h = __msa_ceqi_h(diff0, 0);
        adjust2 =
            (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
        adjust0 =
            (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
        col_sum0 += adjust0;
        running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, (v16i8)adjust2);
        dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
        SD(dst1, running_avg_y_ptr);
        running_avg_y_ptr += avg_y_stride;
      }

      temp0_h = col_sum0;
      temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
      temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
      temp1_d = __msa_splati_d(temp0_d, 1);
      temp0_d += temp1_d;
      sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);

      if (abs(sum_diff) > sum_diff_thresh) {
        return COPY_BLOCK;
      }
    } else {
      return COPY_BLOCK;
    }
  }

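  /* Write the denoised running average back over the source block, as the C
   * reference does for chroma. */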
  LD4(running_avg_y_start, avg_y_stride, src0, src1, src2, src3);
  running_avg_y_start += (4 * avg_y_stride);
  SD4(src0, src1, src2, src3, sig_start, sig_stride);
  sig_start += (4 * sig_stride);

  LD4(running_avg_y_start, avg_y_stride, src0, src1, src2, src3);
  SD4(src0, src1, src2, src3, sig_start, sig_stride);

  return FILTER_BLOCK;
}