/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"

#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_postproc.h"

// TODO(jackychen): Replace this function with SSE2 code. There is
// one SSE2 implementation in vp8; consider how to share it between
// vp8 and vp9.
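// Blend the source block into the destination block with fixed-point
// weights: dst = (src * src_weight + dst * dst_weight + round) >>
// MFQE_PRECISION, where dst_weight = (1 << MFQE_PRECISION) - src_weight.
// For example, assuming MFQE_PRECISION is 4, src_weight == 12 gives
// dst' = (12 * src + 4 * dst + 8) >> 4, i.e. a 3/4-to-1/4 blend.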
static void filter_by_weight(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             int block_size, int src_weight) {
  const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
  const int rounding_bit = 1 << (MFQE_PRECISION - 1);
  int r, c;

  for (r = 0; r < block_size; r++) {
    for (c = 0; c < block_size; c++) {
      dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
               >> MFQE_PRECISION;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride, int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
}

void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride,
                                 int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
}

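// The 32x32 and 64x64 filters below have no dedicated kernels; they tile
// the block into four square quadrants and filter each quadrant with the
// next smaller size.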
static void filter_by_weight32x32(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int weight) {
  vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
  vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
                            weight);
  vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
                            dst + dst_stride * 16, dst_stride, weight);
  vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
                            dst + dst_stride * 16 + 16, dst_stride, weight);
}

static void filter_by_weight64x64(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int weight) {
  filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
  filter_by_weight32x32(src + 32, src_stride, dst + 32,
                        dst_stride, weight);
  filter_by_weight32x32(src + src_stride * 32, src_stride,
                        dst + dst_stride * 32, dst_stride, weight);
  filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
                        dst + dst_stride * 32 + 32, dst_stride, weight);
}

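// Blend the luma plane at the given block size and the chroma planes at
// half that size (the frame is assumed to be 4:2:0 subsampled).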
static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
                          int yd_stride, const uint8_t *u, const uint8_t *v,
                          int uv_stride, uint8_t *ud, uint8_t *vd,
                          int uvd_stride, BLOCK_SIZE block_size,
                          int weight) {
  if (block_size == BLOCK_16X16) {
    vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
    vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
    vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
  } else if (block_size == BLOCK_32X32) {
    filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
    vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
    vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
  } else if (block_size == BLOCK_64X64) {
    filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
    filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
    filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
  }
}

// TODO(jackychen): Determine whether to replace this function with
// assembly code.
static void copy_mem8x8(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride) {
  int r;
  for (r = 0; r < 8; r++) {
    memcpy(dst, src, 8);
    src += src_stride;
    dst += dst_stride;
  }
}

static void copy_mem16x16(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
  int r;
  for (r = 0; r < 16; r++) {
    memcpy(dst, src, 16);
    src += src_stride;
    dst += dst_stride;
  }
}

static void copy_mem32x32(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
  copy_mem16x16(src, src_stride, dst, dst_stride);
  copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
  copy_mem16x16(src + src_stride * 16, src_stride,
                dst + dst_stride * 16, dst_stride);
  copy_mem16x16(src + src_stride * 16 + 16, src_stride,
                dst + dst_stride * 16 + 16, dst_stride);
}

static void copy_mem64x64(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
  copy_mem32x32(src, src_stride, dst, dst_stride);
  copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
  copy_mem32x32(src + src_stride * 32, src_stride,
                dst + dst_stride * 32, dst_stride);
  copy_mem32x32(src + src_stride * 32 + 32, src_stride,
                dst + dst_stride * 32 + 32, dst_stride);
}

static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                       int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
                       uint8_t *vd, int yd_stride, int uvd_stride,
                       BLOCK_SIZE bs) {
  if (bs == BLOCK_16X16) {
    copy_mem16x16(y, y_stride, yd, yd_stride);
    copy_mem8x8(u, uv_stride, ud, uvd_stride);
    copy_mem8x8(v, uv_stride, vd, uvd_stride);
  } else if (bs == BLOCK_32X32) {
    copy_mem32x32(y, y_stride, yd, yd_stride);
    copy_mem16x16(u, uv_stride, ud, uvd_stride);
    copy_mem16x16(v, uv_stride, vd, uvd_stride);
  } else {
    copy_mem64x64(y, y_stride, yd, yd_stride);
    copy_mem32x32(u, uv_stride, ud, uvd_stride);
    copy_mem32x32(v, uv_stride, vd, uvd_stride);
  }
}

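// Pick the SAD and variance thresholds for a block size. Both grow with
// the qindex gap (qdiff) between the current frame and the last good
// frame; larger thresholds shrink the ifactor computed below, which
// blends more of the higher-quality last frame into the result.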
static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
  const int adj = qdiff >> MFQE_PRECISION;
  if (bs == BLOCK_16X16) {
    *sad_thr = 7 + adj;
  } else if (bs == BLOCK_32X32) {
    *sad_thr = 6 + adj;
  } else {  // BLOCK_64X64
    *sad_thr = 5 + adj;
  }
  *vdiff_thr = 125 + qdiff;
}

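// Filter one block. The SAD and variance between the current and last
// frame are normalized per pixel with rounding: >> 8 for the 256 pixels
// of a 16x16 block, >> 10 for 32x32, and >> 12 for 64x64.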
static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
                       const uint8_t *v, int y_stride, int uv_stride,
                       uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
                       int uvd_stride, int qdiff) {
  int sad, sad_thr, vdiff, vdiff_thr;
  uint32_t sse;

  get_thr(bs, qdiff, &sad_thr, &vdiff_thr);

  if (bs == BLOCK_16X16) {
    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
    sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
  } else if (bs == BLOCK_32X32) {
    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
    sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
  } else /* if (bs == BLOCK_64X64) */ {
    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
    sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
  }

  // Require vdiff > sad * 3 so that vdiff is not too small relative to
  // the SAD; a small vdiff with a large SAD usually indicates a lighting
  // change over a smooth area, where applying MFQE is risky.
  if (sad > 1 && vdiff > sad * 3) {
    const int weight = 1 << MFQE_PRECISION;
    int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
    // When ifactor reaches weight, the current block is kept unchanged,
    // i.e. no MFQE is done.
    if (ifactor > weight) {
      ifactor = weight;
    }
    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
                  uvd_stride, bs, ifactor);
  } else {
    // Copy the block from the current frame (i.e., no MFQE is done).
    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
               yd_stride, uvd_stride, bs);
  }
}

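// Decide whether a block is a good MFQE candidate: it must be inter
// coded, at least 16x16, and nearly static. Motion vectors are in
// 1/8-pel units, so a squared length of 100 corresponds to a motion of
// about 1.25 pixels.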
static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
  // Check the motion in the current block (for an inter frame), or the
  // motion in the co-located block in the last frame (for a keyframe).
  const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
                            mi->mbmi.mv[0].as_mv.row +
                            mi->mbmi.mv[0].as_mv.col *
                            mi->mbmi.mv[0].as_mv.col;
  const int mv_threshold = 100;
  return mi->mbmi.mode >= NEARESTMV &&  // Not an intra block
         cur_bs >= BLOCK_16X16 &&
         mv_len_square <= mv_threshold;
}

// Process each partition in a super block, recursively.
static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
                           const uint8_t *y, const uint8_t *u,
                           const uint8_t *v, int y_stride, int uv_stride,
                           uint8_t *yd, uint8_t *ud, uint8_t *vd,
                           int yd_stride, int uvd_stride) {
  int mi_offset, y_offset, uv_offset;
  const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
  const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
  const int bsl = b_width_log2_lookup[bs];
  PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
  const BLOCK_SIZE subsize = get_subsize(bs, partition);

  if (cur_bs < BLOCK_8X8) {
    // Blocks smaller than 8x8 can only occur on the frame boundary.
    return;
  }
  // No MFQE on blocks smaller than 16x16, so stop recursing at 16x16.
  if (bs == BLOCK_16X16) {
    partition = PARTITION_NONE;
  }
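  // Offsets to the second half of the block: mi_offset is in 8x8
  // mode-info units, y_offset in luma pixels, and uv_offset in chroma
  // pixels (half of y_offset with 4:2:0 subsampling).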
  if (bs == BLOCK_64X64) {
    mi_offset = 4;
    y_offset = 32;
    uv_offset = 16;
  } else {
    mi_offset = 2;
    y_offset = 16;
    uv_offset = 8;
  }
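  // mfqe_block only handles square sizes, so horizontal and vertical
  // partitions are filtered as two square sub-blocks each, with one MFQE
  // decision made per rectangular partition.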
  switch (partition) {
    BLOCK_SIZE mfqe_bs, bs_tmp;
    case PARTITION_HORZ:
      if (bs == BLOCK_64X64) {
        mfqe_bs = BLOCK_64X32;
        bs_tmp = BLOCK_32X32;
      } else {
        mfqe_bs = BLOCK_32X16;
        bs_tmp = BLOCK_16X16;
      }
      if (mfqe_decision(mi, mfqe_bs)) {
        // Do MFQE on the first square partition.
        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
        // Do MFQE on the second square partition.
        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
      }
      if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
        // Do MFQE on the first square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                   v + uv_offset * uv_stride, y_stride, uv_stride,
                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
        // Do MFQE on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                   u + uv_offset * uv_stride + uv_offset,
                   v + uv_offset * uv_stride + uv_offset, y_stride,
                   uv_stride, yd + y_offset * yd_stride + y_offset,
                   ud + uv_offset * uvd_stride + uv_offset,
                   vd + uv_offset * uvd_stride + uv_offset,
                   yd_stride, uvd_stride, qdiff);
      }
      break;
    case PARTITION_VERT:
      if (bs == BLOCK_64X64) {
        mfqe_bs = BLOCK_32X64;
        bs_tmp = BLOCK_32X32;
      } else {
        mfqe_bs = BLOCK_16X32;
        bs_tmp = BLOCK_16X16;
      }
      if (mfqe_decision(mi, mfqe_bs)) {
        // Do MFQE on the first square partition.
        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
        // Do MFQE on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                   v + uv_offset * uv_stride, y_stride, uv_stride,
                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
      }
      if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
        // Do MFQE on the first square partition.
        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
        // Do MFQE on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                   u + uv_offset * uv_stride + uv_offset,
                   v + uv_offset * uv_stride + uv_offset, y_stride,
                   uv_stride, yd + y_offset * yd_stride + y_offset,
                   ud + uv_offset * uvd_stride + uv_offset,
                   vd + uv_offset * uvd_stride + uv_offset,
                   yd_stride, uvd_stride, qdiff);
      }
      break;
    case PARTITION_NONE:
      if (mfqe_decision(mi, cur_bs)) {
        // Do MFQE on this partition.
        mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
      } else {
        // Copy the block from the current frame (i.e., no MFQE is done).
        copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
                   yd_stride, uvd_stride, bs);
      }
      break;
    case PARTITION_SPLIT:
      // Recurse into the four square sub-blocks, e.g. if bs is 64X64,
      // look into the four 32X32 blocks inside it.
      mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
                     yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
                     v + uv_offset, y_stride, uv_stride, yd + y_offset,
                     ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
                     y + y_offset * y_stride, u + uv_offset * uv_stride,
                     v + uv_offset * uv_stride, y_stride, uv_stride,
                     yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                     vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
                     subsize, y + y_offset * y_stride + y_offset,
                     u + uv_offset * uv_stride + uv_offset,
                     v + uv_offset * uv_stride + uv_offset, y_stride,
                     uv_stride, yd + y_offset * yd_stride + y_offset,
                     ud + uv_offset * uvd_stride + uv_offset,
                     vd + uv_offset * uvd_stride + uv_offset,
                     yd_stride, uvd_stride);
      break;
    default:
      assert(0);
  }
}

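// Apply MFQE to the whole frame: walk the frame in 64x64 super blocks and
// process each recursively. Mode-info units cover 8x8 luma pixels, so
// "mi_row << 3" converts a mode-info row to a luma pixel row and
// "mi_row << 2" to a chroma row (4:2:0). On keyframes, motion info from
// the co-located block in the last frame is used instead.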
void vp9_mfqe(VP9_COMMON *cm) {
  int mi_row, mi_col;
  // Current decoded frame.
  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
  // Last decoded frame; it will also store the MFQE result.
  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
  // Loop through each super block.
  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
      MODE_INFO *mi;
      MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
      // Motion info from the last frame.
      MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
                           (mi_row * cm->mi_stride + mi_col);
      const uint32_t y_stride = show->y_stride;
      const uint32_t uv_stride = show->uv_stride;
      const uint32_t yd_stride = dest->y_stride;
      const uint32_t uvd_stride = dest->uv_stride;
      const uint32_t row_offset_y = mi_row << 3;
      const uint32_t row_offset_uv = mi_row << 2;
      const uint32_t col_offset_y = mi_col << 3;
      const uint32_t col_offset_uv = mi_col << 2;
      const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
                         col_offset_y;
      const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
                         col_offset_uv;
      const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
                         col_offset_uv;
      uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
      uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
                    col_offset_uv;
      uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
                    col_offset_uv;
      if (frame_is_intra_only(cm)) {
        mi = mi_prev;
      } else {
        mi = mi_local;
      }
      mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
                     vd, yd_stride, uvd_stride);
    }
  }
}