1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  *
11  */
12 
13 #include <math.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 #include "config/aom_scale_rtcd.h"
18 
19 #include "aom_mem/aom_mem.h"
20 #include "av1/common/onyxc_int.h"
21 #include "av1/common/resize.h"
22 #include "av1/common/restoration.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_mem/aom_mem.h"
25 
26 #include "aom_ports/mem.h"
27 
28 // The 's' values are calculated based on original 'r' and 'e' values in the
29 // spec using GenSgrprojVtable().
30 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
31 const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
32   { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
33   { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
34   { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
35   { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
36   { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
37   { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
38   { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
39   { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
40 };
41 
av1_whole_frame_rect(const AV1_COMMON * cm,int is_uv)42 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
43   AV1PixelRect rect;
44 
45   int ss_x = is_uv && cm->seq_params.subsampling_x;
46   int ss_y = is_uv && cm->seq_params.subsampling_y;
47 
48   rect.top = 0;
49   rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
50   rect.left = 0;
51   rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52   return rect;
53 }
54 
55 // Count horizontal or vertical units per tile (use a width or height for
56 // tile_size, respectively). We basically want to divide the tile size by the
57 // size of a restoration unit. Rather than rounding up unconditionally as you
58 // might expect, we round to nearest, which models the way a right or bottom
59 // restoration unit can extend to up to 150% its normal width or height. The
60 // max with 1 is to deal with tiles that are smaller than half of a restoration
61 // unit.
// Returns tile_size / unit_size rounded to nearest, but never less than 1.
int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
  // Adding half a unit before the integer division rounds to nearest.
  const int rounded_units = (tile_size + (unit_size >> 1)) / unit_size;
  if (rounded_units > 1) return rounded_units;
  return 1;
}
65 
// (Re)allocates rsi->unit_info so that it holds one RestorationUnitInfo per
// restoration unit of the plane selected by is_uv, and records the unit grid
// dimensions in rsi. Any previously allocated unit_info is freed first.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  // The unit grid is sized from the whole-frame rectangle of this plane.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  // av1_lr_count_units_in_tile() rounds to nearest (minimum 1) rather than
  // rounding up, so the last unit in a row or column absorbs any remainder.
  const int unit_size = rsi->restoration_unit_size;
  const int units_wide = av1_lr_count_units_in_tile(unit_size, frame_w);
  const int units_high = av1_lr_count_units_in_tile(unit_size, frame_h);

  rsi->horz_units_per_tile = units_wide;
  rsi->vert_units_per_tile = units_high;
  rsi->units_per_tile = units_wide * units_high;

  // A single grid covers the frame, so the total is just units_per_tile.
  const int nunits = rsi->units_per_tile;

  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * nunits));
}
100 
// Releases rst_info->unit_info and leaves the field NULL so that a later
// free or re-allocation does not see a stale pointer.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo *const units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(units);
}
105 
#if 0
// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

// Fills sgrproj_mtable from sgr_params: for every enabled radius the entry is
// (1 << SGRPROJ_MTABLE_BITS) / (n * n * e), rounded to nearest, where n is the
// pixel count of the (2r + 1) x (2r + 1) window. Disabled radii get -1.
static void GenSgrprojVtable() {
  for (int idx = 0; idx < SGRPROJ_PARAMS; ++idx) {
    const sgr_params_type *const params = &sgr_params[idx];
    for (int pass = 0; pass < 2; ++pass) {
      const int e = params->e[pass];
      const int r = params->r[pass];
      if (r == 0) {
        // Filter is disabled for this pass: mark the entry invalid.
        sgrproj_mtable[idx][pass] = -1;
        continue;
      }
      const int n = (2 * r + 1) * (2 * r + 1);
      const int n2e = n * n * e;
      assert(n2e != 0);
      // Adding n2e / 2 before dividing rounds to nearest.
      sgrproj_mtable[idx][pass] =
          (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
  }
}
#endif
130 
// One-time precalculation hook for loop restoration. It is currently a no-op:
// the only work it would do, GenSgrprojVtable(), is compiled out with #if 0.
//
// Declared with (void) so the definition is a real prototype; an empty
// parameter list in C leaves the arguments unspecified.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
136 
// Pads an 8-bit plane in place by edge replication: border_horz pixels to the
// left and right of every row, then border_vert full (already padded) rows
// above and below. data points at the top-left pixel of the width x height
// area; the caller must have allocated room for the borders around it.
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
  const int ext_width = width + 2 * border_horz;

  // Left/right: repeat the first and last pixel of each row.
  for (int y = 0; y < height; ++y) {
    uint8_t *const row = data + y * stride;
    memset(row - border_horz, row[0], border_horz);
    memset(row + width, row[width - 1], border_horz);
  }

  // Top: repeat the padded first row, farthest border row first.
  uint8_t *const ext_origin = data - border_horz;
  for (int k = border_vert; k > 0; --k) {
    memcpy(ext_origin - k * stride, ext_origin, ext_width);
  }

  // Bottom: repeat the padded last row.
  for (int k = 0; k < border_vert; ++k) {
    memcpy(ext_origin + (height + k) * stride,
           ext_origin + (height - 1) * stride, ext_width);
  }
}
155 
// 16-bit counterpart of the low-bitdepth frame extension: pads the plane in
// place by replicating edge pixels border_horz samples sideways and then
// border_vert padded rows above and below. width, stride and the borders are
// counted in uint16_t samples.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  // Left/right: repeat the first and last sample of each row.
  for (int y = 0; y < height; ++y) {
    uint16_t *const row = data + y * stride;
    uint16_t *const left_pad = row - border_horz;
    uint16_t *const right_pad = row + width;
    for (int x = 0; x < border_horz; ++x) left_pad[x] = row[0];
    for (int x = 0; x < border_horz; ++x) right_pad[x] = row[width - 1];
  }

  // Top: repeat the padded first row, farthest border row first.
  uint16_t *const ext_origin = data - border_horz;
  for (int k = border_vert; k > 0; --k) {
    memcpy(ext_origin - k * stride, ext_origin,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }

  // Bottom: repeat the padded last row.
  for (int k = 0; k < border_vert; ++k) {
    memcpy(ext_origin + (height + k) * stride,
           ext_origin + (height - 1) * stride,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
}
175 
// Pads a plane by edge replication, dispatching on bit depth. When highbd is
// nonzero, data is first converted with CONVERT_TO_SHORTPTR() and the 16-bit
// routine is used; otherwise data is treated as plain 8-bit samples.
void extend_frame(uint8_t *data, int width, int height, int stride,
                  int border_horz, int border_vert, int highbd) {
  if (!highbd) {
    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
    return;
  }
  extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                      border_horz, border_vert);
}
184 
// Copies a width x height block of 8-bit samples from src to dst, one row at
// a time, honouring each buffer's own stride.
static void copy_tile_lowbd(int width, int height, const uint8_t *src,
                            int src_stride, uint8_t *dst, int dst_stride) {
  int row = 0;
  while (row < height) {
    const uint8_t *const src_row = src + row * src_stride;
    uint8_t *const dst_row = dst + row * dst_stride;
    memcpy(dst_row, src_row, width);
    ++row;
  }
}
190 
// Copies a width x height block of 16-bit samples from src to dst, one row at
// a time. width and both strides are counted in uint16_t samples.
static void copy_tile_highbd(int width, int height, const uint16_t *src,
                             int src_stride, uint16_t *dst, int dst_stride) {
  int row = 0;
  while (row < height) {
    const uint16_t *const src_row = src + row * src_stride;
    uint16_t *const dst_row = dst + row * dst_stride;
    memcpy(dst_row, src_row, width * sizeof(*dst));
    ++row;
  }
}
196 
// Copies a width x height block from src to dst, dispatching on bit depth.
// When highbd is nonzero both pointers are converted with
// CONVERT_TO_SHORTPTR() and copied as 16-bit samples.
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int highbd) {
  if (!highbd) {
    copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
    return;
  }
  copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                   CONVERT_TO_SHORTPTR(dst), dst_stride);
}
205 
// Yields the byte pointer to hand to memcpy(): for high bitdepth this is
// CONVERT_TO_SHORTPTR(ptr8) cast back to uint8_t *, otherwise ptr8 itself.
#define REAL_PTR(use_hbd, ptr8) \
  ((use_hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(ptr8) : (ptr8))
207 
208 // With striped loop restoration, the filtering for each 64-pixel stripe gets
209 // most of its input from the output of CDEF (stored in data8), but we need to
210 // fill out a border of 3 pixels above/below the stripe according to the
211 // following
212 // rules:
213 //
214 // * At a frame boundary, we copy the outermost row of CDEF pixels three times.
215 //   This extension is done by a call to extend_frame() at the start of the loop
216 //   restoration process, so the value of copy_above/copy_below doesn't strictly
217 //   matter.
218 //   However, by setting *copy_above = *copy_below = 1 whenever loop filtering
219 //   across tiles is disabled, we can allow
220 //   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
221 //   data has always been copied, simplifying the behaviour at the left and
222 //   right edges of tiles.
223 //
224 // * If we're at a tile boundary and loop filtering across tiles is enabled,
225 //   then there is a logical stripe which is 64 pixels high, but which is split
226 //   into an 8px high and a 56px high stripe so that the processing (and
227 //   coefficient set usage) can be aligned to tiles.
228 //   In this case, we use the 3 rows of CDEF output across the boundary for
229 //   context; this corresponds to leaving the frame buffer as-is.
230 //
231 // * If we're at a tile boundary and loop filtering across tiles is disabled,
232 //   then we take the outermost row of CDEF pixels *within the current tile*
233 //   and copy it three times. Thus we behave exactly as if the tile were a full
234 //   frame.
235 //
236 // * Otherwise, we're at a stripe boundary within a tile. In that case, we
237 //   take 2 rows of deblocked pixels and extend them to 3 rows of context.
238 //
239 // The distinction between the latter two cases is handled by the
240 // av1_loop_restoration_save_boundary_lines() function, so here we just need
241 // to decide if we're overwriting the above/below boundary pixels or not.
get_stripe_boundary_info(const RestorationTileLimits * limits,const AV1PixelRect * tile_rect,int ss_y,int * copy_above,int * copy_below)242 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
243                                      const AV1PixelRect *tile_rect, int ss_y,
244                                      int *copy_above, int *copy_below) {
245   *copy_above = 1;
246   *copy_below = 1;
247 
248   const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
249   const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
250 
251   const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
252   const int this_stripe_height =
253       full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
254   const int last_stripe_in_tile =
255       (limits->v_start + this_stripe_height >= tile_rect->bottom);
256 
257   if (first_stripe_in_tile) *copy_above = 0;
258   if (last_stripe_in_tile) *copy_below = 0;
259 }
260 
261 // Overwrite the border pixels around a processing stripe so that the conditions
262 // listed above get_stripe_boundary_info() are preserved.
263 // We save the pixels which get overwritten into a temporary buffer, so that
264 // they can be restored by restore_processing_stripe_boundary() after we've
265 // processed the stripe.
266 //
267 // limits gives the rectangular limits of the remaining stripes for the current
268 // restoration unit. rsb is the stored stripe boundaries (taken from either
269 // deblock or CDEF output as necessary).
270 //
// rsb_row is the first row of this stripe's saved context lines within the
// rsb buffers, and h is the height of the stripe being processed. If opt is
// nonzero, only the outermost border line on each side is saved and replaced
// (with a copy of the adjacent line already in the frame buffer).
// Copies the line_size bytes currently at dst8 into save, then overwrites
// them with line_size bytes from src. dst8 is passed through REAL_PTR()
// before being dereferenced; src and save are used as given.
static void stripe_save_and_replace_line(uint8_t *dst8, const void *src,
                                         void *save, int line_size,
                                         int use_highbd) {
  uint8_t *const dst = REAL_PTR(use_highbd, dst8);
  memcpy(save, dst, line_size);
  memcpy(dst, src, line_size);
}

static void setup_processing_stripe_boundary(
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
  // Offsets within the line buffers. The buffer logically starts at column
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
  // has column x0 in the buffer.
  const int buf_stride = rsb->stripe_boundary_stride;
  const int buf_x0_off = limits->h_start;
  const int line_width =
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
  // Bytes per line: samples are two bytes wide when use_highbd is 1.
  const int line_size = line_width << use_highbd;

  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;

  if (opt) {
    // Reduced path: only the outermost border line on each side is saved and
    // then overwritten with a copy of the line next to it in the frame.
    if (copy_above) {
      uint8_t *const top8 = data8 + data_x0 + limits->v_start * data_stride;
      uint8_t *const outer8 = top8 - RESTORATION_BORDER * data_stride;
      const uint8_t *const inner = REAL_PTR(use_highbd, outer8 + data_stride);
      stripe_save_and_replace_line(outer8, inner, rlbs->tmp_save_above[0],
                                   line_size, use_highbd);
    }

    if (copy_below) {
      const int stripe_end = limits->v_start + h;
      uint8_t *const bottom8 = data8 + data_x0 + stripe_end * data_stride;
      uint8_t *const outer8 = bottom8 + 2 * data_stride;
      const uint8_t *const inner = REAL_PTR(use_highbd, bottom8 + data_stride);
      stripe_save_and_replace_line(outer8, inner, rlbs->tmp_save_below[2],
                                   line_size, use_highbd);
    }
    return;
  }

  // Full path, above: RESTORATION_CTX_VERT saved lines from
  // rsb->stripe_boundary_above are stretched over RESTORATION_BORDER frame
  // lines by clamping the context row at 0, so the topmost saved line is
  // reused for the outermost border rows.
  if (copy_above) {
    uint8_t *const top8 = data8 + data_x0 + limits->v_start * data_stride;

    for (int row = 0; row < RESTORATION_BORDER; ++row) {
      const int ctx_row =
          AOMMAX(row - RESTORATION_BORDER + RESTORATION_CTX_VERT, 0);
      const int buf_off = buf_x0_off + (rsb_row + ctx_row) * buf_stride;
      const uint8_t *const saved =
          rsb->stripe_boundary_above + (buf_off << use_highbd);
      uint8_t *const dst8 = top8 + (row - RESTORATION_BORDER) * data_stride;
      stripe_save_and_replace_line(dst8, saved, rlbs->tmp_save_above[row],
                                   line_size, use_highbd);
    }
  }

  // Full path, below: same idea with rsb->stripe_boundary_below, clamping the
  // context row at RESTORATION_CTX_VERT - 1 so the last saved line repeats.
  if (copy_below) {
    const int stripe_end = limits->v_start + h;
    uint8_t *const bottom8 = data8 + data_x0 + stripe_end * data_stride;

    for (int row = 0; row < RESTORATION_BORDER; ++row) {
      const int ctx_row = AOMMIN(row, RESTORATION_CTX_VERT - 1);
      const int buf_off = buf_x0_off + (rsb_row + ctx_row) * buf_stride;
      const uint8_t *const saved =
          rsb->stripe_boundary_below + (buf_off << use_highbd);
      uint8_t *const dst8 = bottom8 + row * data_stride;
      stripe_save_and_replace_line(dst8, saved, rlbs->tmp_save_below[row],
                                   line_size, use_highbd);
    }
  }
}
363 
364 // This function restores the boundary lines modified by
365 // setup_processing_stripe_boundary.
366 //
367 // Note: We need to be careful when handling the corners of the processing
368 // unit, because (eg.) the top-left corner is considered to be part of
369 // both the left and top borders. This means that, depending on the
370 // loop_filter_across_tiles_enabled flag, the corner pixels might get
371 // overwritten twice, once as part of the "top" border and once as part
372 // of the "left" border (or similar for other corners).
373 //
374 // Everything works out fine as long as we make sure to reverse the order
375 // when restoring, ie. we need to restore the left/right borders followed
376 // by the top/bottom borders.
restore_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationLineBuffers * rlbs,int use_highbd,int h,uint8_t * data8,int data_stride,int copy_above,int copy_below,int opt)377 static void restore_processing_stripe_boundary(
378     const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
379     int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
380     int copy_below, int opt) {
381   const int line_width =
382       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
383   const int line_size = line_width << use_highbd;
384 
385   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
386 
387   if (!opt) {
388     if (copy_above) {
389       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
390       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
391         uint8_t *dst8 = data8_tl + i * data_stride;
392         memcpy(REAL_PTR(use_highbd, dst8),
393                rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
394       }
395     }
396 
397     if (copy_below) {
398       const int stripe_bottom = limits->v_start + h;
399       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
400 
401       for (int i = 0; i < RESTORATION_BORDER; ++i) {
402         if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
403 
404         uint8_t *dst8 = data8_bl + i * data_stride;
405         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
406       }
407     }
408   } else {
409     if (copy_above) {
410       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
411 
412       // Only restore i=-RESTORATION_BORDER line.
413       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
414       memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
415     }
416 
417     if (copy_below) {
418       const int stripe_bottom = limits->v_start + h;
419       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
420 
421       // Only restore i=2 line.
422       if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
423         uint8_t *dst8 = data8_bl + 2 * data_stride;
424         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
425       }
426     }
427   }
428 }
429 
wiener_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)430 static void wiener_filter_stripe(const RestorationUnitInfo *rui,
431                                  int stripe_width, int stripe_height,
432                                  int procunit_width, const uint8_t *src,
433                                  int src_stride, uint8_t *dst, int dst_stride,
434                                  int32_t *tmpbuf, int bit_depth) {
435   (void)tmpbuf;
436   (void)bit_depth;
437   assert(bit_depth == 8);
438   const ConvolveParams conv_params = get_conv_params_wiener(8);
439 
440   for (int j = 0; j < stripe_width; j += procunit_width) {
441     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
442     const uint8_t *src_p = src + j;
443     uint8_t *dst_p = dst + j;
444     av1_wiener_convolve_add_src(
445         src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
446         rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
447   }
448 }
449 
450 /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
451    over the input. The window is of size (2r + 1)x(2r + 1), and we
452    specialize to r = 1, 2, 3. A default function is used for r > 3.
453 
454    Each loop follows the same format: We keep a window's worth of input
455    in individual variables and select data out of that as appropriate.
456 */
boxsum1(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)457 static void boxsum1(int32_t *src, int width, int height, int src_stride,
458                     int sqr, int32_t *dst, int dst_stride) {
459   int i, j, a, b, c;
460   assert(width > 2 * SGRPROJ_BORDER_HORZ);
461   assert(height > 2 * SGRPROJ_BORDER_VERT);
462 
463   // Vertical sum over 3-pixel regions, from src into dst.
464   if (!sqr) {
465     for (j = 0; j < width; ++j) {
466       a = src[j];
467       b = src[src_stride + j];
468       c = src[2 * src_stride + j];
469 
470       dst[j] = a + b;
471       for (i = 1; i < height - 2; ++i) {
472         // Loop invariant: At the start of each iteration,
473         // a = src[(i - 1) * src_stride + j]
474         // b = src[(i    ) * src_stride + j]
475         // c = src[(i + 1) * src_stride + j]
476         dst[i * dst_stride + j] = a + b + c;
477         a = b;
478         b = c;
479         c = src[(i + 2) * src_stride + j];
480       }
481       dst[i * dst_stride + j] = a + b + c;
482       dst[(i + 1) * dst_stride + j] = b + c;
483     }
484   } else {
485     for (j = 0; j < width; ++j) {
486       a = src[j] * src[j];
487       b = src[src_stride + j] * src[src_stride + j];
488       c = src[2 * src_stride + j] * src[2 * src_stride + j];
489 
490       dst[j] = a + b;
491       for (i = 1; i < height - 2; ++i) {
492         dst[i * dst_stride + j] = a + b + c;
493         a = b;
494         b = c;
495         c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
496       }
497       dst[i * dst_stride + j] = a + b + c;
498       dst[(i + 1) * dst_stride + j] = b + c;
499     }
500   }
501 
502   // Horizontal sum over 3-pixel regions of dst
503   for (i = 0; i < height; ++i) {
504     a = dst[i * dst_stride];
505     b = dst[i * dst_stride + 1];
506     c = dst[i * dst_stride + 2];
507 
508     dst[i * dst_stride] = a + b;
509     for (j = 1; j < width - 2; ++j) {
510       // Loop invariant: At the start of each iteration,
511       // a = src[i * src_stride + (j - 1)]
512       // b = src[i * src_stride + (j    )]
513       // c = src[i * src_stride + (j + 1)]
514       dst[i * dst_stride + j] = a + b + c;
515       a = b;
516       b = c;
517       c = dst[i * dst_stride + (j + 2)];
518     }
519     dst[i * dst_stride + j] = a + b + c;
520     dst[i * dst_stride + (j + 1)] = b + c;
521   }
522 }
523 
boxsum2(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)524 static void boxsum2(int32_t *src, int width, int height, int src_stride,
525                     int sqr, int32_t *dst, int dst_stride) {
526   int i, j, a, b, c, d, e;
527   assert(width > 2 * SGRPROJ_BORDER_HORZ);
528   assert(height > 2 * SGRPROJ_BORDER_VERT);
529 
530   // Vertical sum over 5-pixel regions, from src into dst.
531   if (!sqr) {
532     for (j = 0; j < width; ++j) {
533       a = src[j];
534       b = src[src_stride + j];
535       c = src[2 * src_stride + j];
536       d = src[3 * src_stride + j];
537       e = src[4 * src_stride + j];
538 
539       dst[j] = a + b + c;
540       dst[dst_stride + j] = a + b + c + d;
541       for (i = 2; i < height - 3; ++i) {
542         // Loop invariant: At the start of each iteration,
543         // a = src[(i - 2) * src_stride + j]
544         // b = src[(i - 1) * src_stride + j]
545         // c = src[(i    ) * src_stride + j]
546         // d = src[(i + 1) * src_stride + j]
547         // e = src[(i + 2) * src_stride + j]
548         dst[i * dst_stride + j] = a + b + c + d + e;
549         a = b;
550         b = c;
551         c = d;
552         d = e;
553         e = src[(i + 3) * src_stride + j];
554       }
555       dst[i * dst_stride + j] = a + b + c + d + e;
556       dst[(i + 1) * dst_stride + j] = b + c + d + e;
557       dst[(i + 2) * dst_stride + j] = c + d + e;
558     }
559   } else {
560     for (j = 0; j < width; ++j) {
561       a = src[j] * src[j];
562       b = src[src_stride + j] * src[src_stride + j];
563       c = src[2 * src_stride + j] * src[2 * src_stride + j];
564       d = src[3 * src_stride + j] * src[3 * src_stride + j];
565       e = src[4 * src_stride + j] * src[4 * src_stride + j];
566 
567       dst[j] = a + b + c;
568       dst[dst_stride + j] = a + b + c + d;
569       for (i = 2; i < height - 3; ++i) {
570         dst[i * dst_stride + j] = a + b + c + d + e;
571         a = b;
572         b = c;
573         c = d;
574         d = e;
575         e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
576       }
577       dst[i * dst_stride + j] = a + b + c + d + e;
578       dst[(i + 1) * dst_stride + j] = b + c + d + e;
579       dst[(i + 2) * dst_stride + j] = c + d + e;
580     }
581   }
582 
583   // Horizontal sum over 5-pixel regions of dst
584   for (i = 0; i < height; ++i) {
585     a = dst[i * dst_stride];
586     b = dst[i * dst_stride + 1];
587     c = dst[i * dst_stride + 2];
588     d = dst[i * dst_stride + 3];
589     e = dst[i * dst_stride + 4];
590 
591     dst[i * dst_stride] = a + b + c;
592     dst[i * dst_stride + 1] = a + b + c + d;
593     for (j = 2; j < width - 3; ++j) {
594       // Loop invariant: At the start of each iteration,
595       // a = src[i * src_stride + (j - 2)]
596       // b = src[i * src_stride + (j - 1)]
597       // c = src[i * src_stride + (j    )]
598       // d = src[i * src_stride + (j + 1)]
599       // e = src[i * src_stride + (j + 2)]
600       dst[i * dst_stride + j] = a + b + c + d + e;
601       a = b;
602       b = c;
603       c = d;
604       d = e;
605       e = dst[i * dst_stride + (j + 3)];
606     }
607     dst[i * dst_stride + j] = a + b + c + d + e;
608     dst[i * dst_stride + (j + 1)] = b + c + d + e;
609     dst[i * dst_stride + (j + 2)] = c + d + e;
610   }
611 }
612 
// Dispatches to the box-sum routine specialised for radius r. Only r = 1 and
// r = 2 are implemented; any other value trips an assert.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      assert(0 && "Invalid value of r in self-guided filter");
      break;
  }
}
622 
decode_xq(const int * xqd,int * xq,const sgr_params_type * params)623 void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
624   if (params->r[0] == 0) {
625     xq[0] = 0;
626     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
627   } else if (params->r[1] == 0) {
628     xq[0] = xqd[0];
629     xq[1] = 0;
630   } else {
631     xq[0] = xqd[0];
632     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
633   }
634 }
635 
// Lookup table, 16 values per row. Entries 1..254 equal
// round(256 * x / (x + 1)); entry 0 is special-cased and the final entry
// (index 255) is 256.
const int32_t x_by_xplus1[256] = {
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
  // instead of 0. See comments in selfguided_restoration_internal() for why
  // 0 .. 15
  1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240,
  // 16 .. 31
  241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248,
  248,
  // 32 .. 47
  248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, 250, 251,
  251,
  // 48 .. 63
  251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, 252, 252, 252,
  252,
  // 64 .. 79
  252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253,
  253,
  // 80 .. 95
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253,
  // 96 .. 111
  253, 253, 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254,
  // 112 .. 127
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254,
  // 128 .. 143
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254,
  // 144 .. 159
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254,
  // 160 .. 175
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255,
  255,
  // 176 .. 191
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255,
  // 192 .. 207
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255,
  // 208 .. 223
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255,
  // 224 .. 239
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255,
  // 240 .. 255
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};
658 
659 const int32_t one_by_x[MAX_NELEM] = {
660   4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
661   293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
662 };
663 
calculate_intermediate_result(int32_t * dgd,int width,int height,int dgd_stride,int bit_depth,int sgr_params_idx,int radius_idx,int pass,int32_t * A,int32_t * B)664 static void calculate_intermediate_result(int32_t *dgd, int width, int height,
665                                           int dgd_stride, int bit_depth,
666                                           int sgr_params_idx, int radius_idx,
667                                           int pass, int32_t *A, int32_t *B) {
668   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
669   const int r = params->r[radius_idx];
670   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
671   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
672   // Adjusting the stride of A and B here appears to avoid bad cache effects,
673   // leading to a significant speed improvement.
674   // We also align the stride to a multiple of 16 bytes, for consistency
675   // with the SIMD version of this function.
676   int buf_stride = ((width_ext + 3) & ~3) + 16;
677   const int step = pass == 0 ? 1 : 2;
678   int i, j;
679 
680   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
681   assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
682          "Need SGRPROJ_BORDER_* >= r+1");
683 
684   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
685          width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
686   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
687          width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
688   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
689   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
690   // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
691   // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
692   for (i = -1; i < height + 1; i += step) {
693     for (j = -1; j < width + 1; ++j) {
694       const int k = i * buf_stride + j;
695       const int n = (2 * r + 1) * (2 * r + 1);
696 
697       // a < 2^16 * n < 2^22 regardless of bit depth
698       uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
699       // b < 2^8 * n < 2^14 regardless of bit depth
700       uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
701 
702       // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
703       // and p itself satisfies p < 2^14 * n^2 < 2^26.
704       // This bound on p is due to:
705       // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
706       //
707       // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
708       // This is an artefact of rounding, and can only happen if all pixels
709       // are (almost) identical, so in this case we saturate to p=0.
710       uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
711 
712       const uint32_t s = params->s[radius_idx];
713 
714       // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
715       // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
716       // (this holds even after accounting for the rounding in s)
717       const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
718 
719       // Note: We have to be quite careful about the value of A[k].
720       // This is used as a blend factor between individual pixel values and the
721       // local mean. So it logically has a range of [0, 256], including both
722       // endpoints.
723       //
724       // This is a pain for hardware, as we'd like something which can be stored
725       // in exactly 8 bits.
726       // Further, in the calculation of B[k] below, if z == 0 and r == 2,
727       // then A[k] "should be" 0. But then we can end up setting B[k] to a value
728       // slightly above 2^(8 + bit depth), due to rounding in the value of
729       // one_by_x[25-1].
730       //
731       // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
732       // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
733       // overflow), without significantly affecting the final result: z == 0
734       // implies that the image is essentially "flat", so the local mean and
735       // individual pixel values are very similar.
736       //
737       // Note that saturating on the other side, ie. requring A[k] <= 255,
738       // would be a bad idea, as that corresponds to the case where the image
739       // is very variable, when we want to preserve the local pixel value as
740       // much as possible.
741       A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
742 
743       // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
744       // one_by_x[n - 1] = round(2^12 / n)
745       // => the product here is < 2^(20 + bit_depth) <= 2^32,
746       // and B[k] is set to a value < 2^(8 + bit depth)
747       // This holds even with the rounding in one_by_x and in the overall
748       // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
749       B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
750                                              (uint32_t)B[k] *
751                                              (uint32_t)one_by_x[n - 1],
752                                          SGRPROJ_RECIP_BITS);
753     }
754   }
755 }
756 
selfguided_restoration_fast_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)757 static void selfguided_restoration_fast_internal(
758     int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
759     int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
760   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
761   const int r = params->r[radius_idx];
762   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
763   // Adjusting the stride of A and B here appears to avoid bad cache effects,
764   // leading to a significant speed improvement.
765   // We also align the stride to a multiple of 16 bytes, for consistency
766   // with the SIMD version of this function.
767   int buf_stride = ((width_ext + 3) & ~3) + 16;
768   int32_t A_[RESTORATION_PROC_UNIT_PELS];
769   int32_t B_[RESTORATION_PROC_UNIT_PELS];
770   int32_t *A = A_;
771   int32_t *B = B_;
772   int i, j;
773   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
774                                 sgr_params_idx, radius_idx, 1, A, B);
775   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
776   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
777 
778   // Use the A[] and B[] arrays to calculate the filtered image
779   (void)r;
780   assert(r == 2);
781   for (i = 0; i < height; ++i) {
782     if (!(i & 1)) {  // even row
783       for (j = 0; j < width; ++j) {
784         const int k = i * buf_stride + j;
785         const int l = i * dgd_stride + j;
786         const int m = i * dst_stride + j;
787         const int nb = 5;
788         const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
789                           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
790                            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
791                               5;
792         const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
793                           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
794                            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
795                               5;
796         const int32_t v = a * dgd[l] + b;
797         dst[m] =
798             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
799       }
800     } else {  // odd row
801       for (j = 0; j < width; ++j) {
802         const int k = i * buf_stride + j;
803         const int l = i * dgd_stride + j;
804         const int m = i * dst_stride + j;
805         const int nb = 4;
806         const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
807         const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
808         const int32_t v = a * dgd[l] + b;
809         dst[m] =
810             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
811       }
812     }
813   }
814 }
815 
selfguided_restoration_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)816 static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
817                                             int dgd_stride, int32_t *dst,
818                                             int dst_stride, int bit_depth,
819                                             int sgr_params_idx,
820                                             int radius_idx) {
821   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
822   // Adjusting the stride of A and B here appears to avoid bad cache effects,
823   // leading to a significant speed improvement.
824   // We also align the stride to a multiple of 16 bytes, for consistency
825   // with the SIMD version of this function.
826   int buf_stride = ((width_ext + 3) & ~3) + 16;
827   int32_t A_[RESTORATION_PROC_UNIT_PELS];
828   int32_t B_[RESTORATION_PROC_UNIT_PELS];
829   int32_t *A = A_;
830   int32_t *B = B_;
831   int i, j;
832   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
833                                 sgr_params_idx, radius_idx, 0, A, B);
834   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
835   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
836 
837   // Use the A[] and B[] arrays to calculate the filtered image
838   for (i = 0; i < height; ++i) {
839     for (j = 0; j < width; ++j) {
840       const int k = i * buf_stride + j;
841       const int l = i * dgd_stride + j;
842       const int m = i * dst_stride + j;
843       const int nb = 5;
844       const int32_t a =
845           (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
846               4 +
847           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
848            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
849               3;
850       const int32_t b =
851           (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
852               4 +
853           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
854            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
855               3;
856       const int32_t v = a * dgd[l] + b;
857       dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
858     }
859   }
860 }
861 
av1_selfguided_restoration_c(const uint8_t * dgd8,int width,int height,int dgd_stride,int32_t * flt0,int32_t * flt1,int flt_stride,int sgr_params_idx,int bit_depth,int highbd)862 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
863                                  int dgd_stride, int32_t *flt0, int32_t *flt1,
864                                  int flt_stride, int sgr_params_idx,
865                                  int bit_depth, int highbd) {
866   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
867   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
868   int32_t *dgd32 =
869       dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
870 
871   if (highbd) {
872     const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
873     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
874       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
875         dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
876       }
877     }
878   } else {
879     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
880       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
881         dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
882       }
883     }
884   }
885 
886   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
887   // If params->r == 0 we skip the corresponding filter. We only allow one of
888   // the radii to be 0, as having both equal to 0 would be equivalent to
889   // skipping SGR entirely.
890   assert(!(params->r[0] == 0 && params->r[1] == 0));
891 
892   if (params->r[0] > 0)
893     selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
894                                          flt0, flt_stride, bit_depth,
895                                          sgr_params_idx, 0);
896   if (params->r[1] > 0)
897     selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
898                                     flt_stride, bit_depth, sgr_params_idx, 1);
899   return 0;
900 }
901 
apply_selfguided_restoration_c(const uint8_t * dat8,int width,int height,int stride,int eps,const int * xqd,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,int highbd)902 void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
903                                     int stride, int eps, const int *xqd,
904                                     uint8_t *dst8, int dst_stride,
905                                     int32_t *tmpbuf, int bit_depth,
906                                     int highbd) {
907   int32_t *flt0 = tmpbuf;
908   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
909   assert(width * height <= RESTORATION_UNITPELS_MAX);
910 
911   const int ret = av1_selfguided_restoration_c(
912       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
913   (void)ret;
914   assert(!ret);
915   const sgr_params_type *const params = &sgr_params[eps];
916   int xq[2];
917   decode_xq(xqd, xq, params);
918   for (int i = 0; i < height; ++i) {
919     for (int j = 0; j < width; ++j) {
920       const int k = i * width + j;
921       uint8_t *dst8ij = dst8 + i * dst_stride + j;
922       const uint8_t *dat8ij = dat8 + i * stride + j;
923 
924       const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
925       const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
926       int32_t v = u << SGRPROJ_PRJ_BITS;
927       // If params->r == 0 then we skipped the filtering in
928       // av1_selfguided_restoration_c, i.e. flt[k] == u
929       if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
930       if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
931       const int16_t w =
932           (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
933 
934       const uint16_t out = clip_pixel_highbd(w, bit_depth);
935       if (highbd)
936         *CONVERT_TO_SHORTPTR(dst8ij) = out;
937       else
938         *dst8ij = (uint8_t)out;
939     }
940   }
941 }
942 
sgrproj_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)943 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
944                                   int stripe_width, int stripe_height,
945                                   int procunit_width, const uint8_t *src,
946                                   int src_stride, uint8_t *dst, int dst_stride,
947                                   int32_t *tmpbuf, int bit_depth) {
948   (void)bit_depth;
949   assert(bit_depth == 8);
950 
951   for (int j = 0; j < stripe_width; j += procunit_width) {
952     int w = AOMMIN(procunit_width, stripe_width - j);
953     apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
954                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
955                                  dst + j, dst_stride, tmpbuf, bit_depth, 0);
956   }
957 }
958 
wiener_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)959 static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
960                                         int stripe_width, int stripe_height,
961                                         int procunit_width, const uint8_t *src8,
962                                         int src_stride, uint8_t *dst8,
963                                         int dst_stride, int32_t *tmpbuf,
964                                         int bit_depth) {
965   (void)tmpbuf;
966   const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
967 
968   for (int j = 0; j < stripe_width; j += procunit_width) {
969     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
970     const uint8_t *src8_p = src8 + j;
971     uint8_t *dst8_p = dst8 + j;
972     av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
973                                        rui->wiener_info.hfilter, 16,
974                                        rui->wiener_info.vfilter, 16, w,
975                                        stripe_height, &conv_params, bit_depth);
976   }
977 }
978 
sgrproj_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)979 static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
980                                          int stripe_width, int stripe_height,
981                                          int procunit_width,
982                                          const uint8_t *src8, int src_stride,
983                                          uint8_t *dst8, int dst_stride,
984                                          int32_t *tmpbuf, int bit_depth) {
985   for (int j = 0; j < stripe_width; j += procunit_width) {
986     int w = AOMMIN(procunit_width, stripe_width - j);
987     apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
988                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
989                                  dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
990   }
991 }
992 
993 typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
994                                   int stripe_width, int stripe_height,
995                                   int procunit_width, const uint8_t *src,
996                                   int src_stride, uint8_t *dst, int dst_stride,
997                                   int32_t *tmpbuf, int bit_depth);
998 
999 #define NUM_STRIPE_FILTERS 4
1000 
1001 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1002   wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
1003   sgrproj_filter_stripe_highbd
1004 };
1005 
1006 // Filter one restoration unit
av1_loop_restoration_filter_unit(const RestorationTileLimits * limits,const RestorationUnitInfo * rui,const RestorationStripeBoundaries * rsb,RestorationLineBuffers * rlbs,const AV1PixelRect * tile_rect,int tile_stripe0,int ss_x,int ss_y,int highbd,int bit_depth,uint8_t * data8,int stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int optimized_lr)1007 void av1_loop_restoration_filter_unit(
1008     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
1009     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
1010     const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
1011     int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
1012     int dst_stride, int32_t *tmpbuf, int optimized_lr) {
1013   RestorationType unit_rtype = rui->restoration_type;
1014 
1015   int unit_h = limits->v_end - limits->v_start;
1016   int unit_w = limits->h_end - limits->h_start;
1017   uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
1018   uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
1019 
1020   if (unit_rtype == RESTORE_NONE) {
1021     copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1022     return;
1023   }
1024 
1025   const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1026   assert(filter_idx < NUM_STRIPE_FILTERS);
1027   const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1028 
1029   const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1030 
1031   // Convolve the whole tile one stripe at a time
1032   RestorationTileLimits remaining_stripes = *limits;
1033   int i = 0;
1034   while (i < unit_h) {
1035     int copy_above, copy_below;
1036     remaining_stripes.v_start = limits->v_start + i;
1037 
1038     get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
1039                              &copy_below);
1040 
1041     const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1042     const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1043 
1044     // Work out where this stripe's boundaries are within
1045     // rsb->stripe_boundary_{above,below}
1046     const int tile_stripe =
1047         (remaining_stripes.v_start - tile_rect->top + runit_offset) /
1048         full_stripe_height;
1049     const int frame_stripe = tile_stripe0 + tile_stripe;
1050     const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1051 
1052     // Calculate this stripe's height, based on two rules:
1053     // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1054     // * We can't extend past the end of the current restoration unit
1055     const int nominal_stripe_height =
1056         full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1057     const int h = AOMMIN(nominal_stripe_height,
1058                          remaining_stripes.v_end - remaining_stripes.v_start);
1059 
1060     setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1061                                      h, data8, stride, rlbs, copy_above,
1062                                      copy_below, optimized_lr);
1063 
1064     stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1065                   dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
1066 
1067     restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1068                                        data8, stride, copy_above, copy_below,
1069                                        optimized_lr);
1070 
1071     i += h;
1072   }
1073 }
1074 
// Restoration-unit visitor used when filtering a frame: `priv` is the
// FilterFrameCtxt of the plane being processed. Unpacks it and filters the
// unit with index rest_unit_idx.
static void filter_frame_on_unit(const RestorationTileLimits *limits,
                                 const AV1PixelRect *tile_rect,
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
                                 RestorationLineBuffers *rlbs) {
  const FilterFrameCtxt *const plane_ctxt = (FilterFrameCtxt *)priv;
  const RestorationInfo *const rsi = plane_ctxt->rsi;
  const RestorationUnitInfo *const unit = &rsi->unit_info[rest_unit_idx];

  av1_loop_restoration_filter_unit(
      limits, unit, &rsi->boundaries, rlbs, tile_rect,
      plane_ctxt->tile_stripe0, plane_ctxt->ss_x, plane_ctxt->ss_y,
      plane_ctxt->highbd, plane_ctxt->bit_depth, plane_ctxt->data8,
      plane_ctxt->data_stride, plane_ctxt->dst8, plane_ctxt->dst_stride,
      tmpbuf, rsi->optimized_lr);
}
1088 
// Prepares lr_ctxt for filtering `frame`.
//
// (Re)allocates cm->rst_frame as the destination buffer and reports
// AOM_CODEC_MEM_ERROR through aom_internal_error() on failure. Installs
// filter_frame_on_unit() as the unit visitor. For every plane whose frame
// restoration type is not RESTORE_NONE it then extends the plane borders and
// fills in the per-plane filter context. rsi->optimized_lr is recorded for
// every plane, including the skipped ones.
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                                            YV12_BUFFER_CONFIG *frame,
                                            AV1_COMMON *cm, int optimized_lr,
                                            int num_planes) {
  const SequenceHeader *const seq_params = &cm->seq_params;
  const int bit_depth = seq_params->bit_depth;
  const int highbd = seq_params->use_highbitdepth;
  const int frame_width = frame->crop_widths[0];
  const int frame_height = frame->crop_heights[0];

  lr_ctxt->dst = &cm->rst_frame;

  const int alloc_status = aom_realloc_frame_buffer(
      lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
      seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
      cm->byte_alignment, NULL, NULL, NULL);
  if (alloc_status < 0) {
    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                       "Failed to allocate restoration dst buffer");
  }

  lr_ctxt->on_rest_unit = filter_frame_on_unit;
  lr_ctxt->frame = frame;

  for (int plane = 0; plane < num_planes; ++plane) {
    RestorationInfo *const rsi = &cm->rst_info[plane];
    rsi->optimized_lr = optimized_lr;

    // Planes without restoration need no further setup.
    if (rsi->frame_restoration_type == RESTORE_NONE) continue;

    const int is_uv = plane > 0;
    const int src_stride = frame->strides[is_uv];
    FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];

    extend_frame(frame->buffers[plane], frame->crop_widths[is_uv],
                 frame->crop_heights[is_uv], src_stride, RESTORATION_BORDER,
                 RESTORATION_BORDER, highbd);

    plane_ctxt->rsi = rsi;
    // Chroma planes inherit the sequence's subsampling; luma has none.
    plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
    plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
    plane_ctxt->highbd = highbd;
    plane_ctxt->bit_depth = bit_depth;
    plane_ctxt->data8 = frame->buffers[plane];
    plane_ctxt->data_stride = src_stride;
    plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
    plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
    plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
    plane_ctxt->tile_stripe0 = 0;
  }
}
1140 
// Copies the filtered planes from loop_rest_ctxt->dst back into
// loop_rest_ctxt->frame, restricted to each plane's tile rectangle. Planes
// whose frame restoration type is RESTORE_NONE are not touched.
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
                                      AV1_COMMON *cm, int num_planes) {
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
                           int vstart, int vend);
  // One copy routine per plane index (Y, U, V).
  static const copy_fun copy_funs[3] = {
    aom_yv12_partial_coloc_copy_y,
    aom_yv12_partial_coloc_copy_u,
    aom_yv12_partial_coloc_copy_v,
  };

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      const AV1PixelRect rect = loop_rest_ctxt->ctxt[plane].tile_rect;
      copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, rect.left,
                       rect.right, rect.top, rect.bottom);
    }
  }
}
1157 
// Runs lr_ctxt->on_rest_unit over every restoration unit of every plane that
// has restoration enabled, using the scratch buffers owned by cm.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  FilterFrameCtxt *const plane_ctxts = lr_ctxt->ctxt;

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      FilterFrameCtxt *const plane_ctxt = &plane_ctxts[plane];
      av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                     plane_ctxt, &plane_ctxt->tile_rect,
                                     cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1172 
// Applies loop restoration to a whole frame in three steps: set up the
// context, filter every restoration unit into the temporary destination
// frame, then copy the filtered planes back into `frame`.
// lr_ctxt must point to an AV1LrStruct.
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                       AV1_COMMON *cm, int optimized_lr,
                                       void *lr_ctxt) {
  assert(!cm->all_lossless);

  AV1LrStruct *const ctxt = (AV1LrStruct *)lr_ctxt;
  const int num_planes = av1_num_planes(cm);

  av1_loop_restoration_filter_frame_init(ctxt, frame, cm, optimized_lr,
                                         num_planes);
  foreach_rest_unit_in_planes(ctxt, cm, num_planes);
  av1_loop_restoration_copy_planes(ctxt, cm, num_planes);
}
1188 
// Visits every restoration unit in one row of units of a tile.
//
// The caller supplies the vertical extent in `limits`; this function fills in
// h_start / h_end for each unit in turn and calls on_rest_unit on it,
// bracketed by the on_sync_read / on_sync_write callbacks. Units are
// unit_size wide, except that a remainder narrower than 1.5 * unit_size is
// absorbed into a single final unit.
void av1_foreach_rest_unit_in_row(
    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
    struct AV1LrSyncData *const lr_sync) {
  const int tile_w = tile_rect->right - tile_rect->left;
  const int ext_size = unit_size * 3 / 2;
  const int row_unit_idx0 = unit_idx0 + row_number * hunits_per_tile;

  for (int x0 = 0, col = 0; x0 < tile_w; ++col) {
    const int remaining_w = tile_w - x0;
    int w = unit_size;
    if (remaining_w < ext_size) w = remaining_w;

    limits->h_start = tile_rect->left + x0;
    limits->h_end = limits->h_start + w;
    assert(limits->h_end <= tile_rect->right);

    const int unit_idx = row_unit_idx0 + col;

    // No sync for even numbered rows
    // For odd numbered rows, Loop Restoration of current block requires the LR
    // of top-right and bottom-right blocks to be completed

    // top-right sync
    on_sync_read(lr_sync, row_number, col, plane);
    if ((row_number + 1) < vunits_per_tile) {
      // bottom-right sync
      on_sync_read(lr_sync, row_number + 2, col, plane);
    }

    on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);

    on_sync_write(lr_sync, row_number, col, hunits_per_tile, plane);

    x0 += w;
  }
}
1227 
// No-op read-synchronisation callback: every argument is ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  // Silence unused-parameter warnings.
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1234 
// No-op write-synchronisation callback: every argument is ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  // Silence unused-parameter warnings.
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1243 
// Visits every restoration unit of a tile, one row of units at a time.
//
// Rows are unit_size tall, except that a remainder shorter than
// 1.5 * unit_size is absorbed into a single final row. Each row's vertical
// limits are shifted up by RESTORATION_UNIT_OFFSET >> ss_y, with the top
// clamped to the tile and the bottom of the last row left on the tile's
// bottom edge. Rows are processed with the dummy (no-op) sync callbacks.
static void foreach_rest_unit_in_tile(
    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
  const int tile_h = tile_rect->bottom - tile_rect->top;
  const int ext_size = unit_size * 3 / 2;
  // Offset the tile upwards to align with the restoration processing stripe
  const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;

  const int tile_idx = tile_col + tile_row * tile_cols;
  const int unit_idx0 = tile_idx * units_per_tile;

  for (int y0 = 0, row = 0; y0 < tile_h; ++row) {
    const int remaining_h = tile_h - y0;
    int h = unit_size;
    if (remaining_h < ext_size) h = remaining_h;

    const int nominal_top = tile_rect->top + y0;
    const int nominal_bottom = nominal_top + h;
    assert(nominal_bottom <= tile_rect->bottom);

    // The horizontal limits are filled in per unit by
    // av1_foreach_rest_unit_in_row().
    RestorationTileLimits limits;
    limits.v_start = AOMMAX(tile_rect->top, nominal_top - voffset);
    limits.v_end = nominal_bottom;
    if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;

    av1_foreach_rest_unit_in_row(
        &limits, tile_rect, on_rest_unit, row, unit_size, unit_idx0,
        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);

    y0 += h;
  }
}
1278 
// Run on_rest_unit over every restoration unit of one plane.
//
// The plane is processed as a single tile (LR_TILE_ROW / LR_TILE_COL /
// LR_TILE_COLS) covering tile_rect; the unit counts and unit size are taken
// from cm->rst_info[plane]. priv, tmpbuf and rlbs are forwarded unchanged.
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
                                    rest_unit_visitor_t on_rest_unit,
                                    void *priv, AV1PixelRect *tile_rect,
                                    int32_t *tmpbuf,
                                    RestorationLineBuffers *rlbs) {
  const RestorationInfo *const plane_info = &cm->rst_info[plane];

  // Vertical subsampling only ever applies to the chroma planes (plane > 0).
  const int ss_y = (plane > 0) && cm->seq_params.subsampling_y;

  foreach_rest_unit_in_tile(
      tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
      plane_info->horz_units_per_tile, plane_info->vert_units_per_tile,
      plane_info->units_per_tile, plane_info->restoration_unit_size, ss_y,
      plane, on_rest_unit, priv, tmpbuf, rlbs);
}
1294 
// Compute which restoration units of `plane` start inside the superblock at
// (mi_row, mi_col).
//
// The result is a pair of half-open index ranges, [*rcol0, *rcol1) for unit
// columns and [*rrow0, *rrow1) for unit rows, measured from the top-left of
// the rectangle returned by av1_whole_frame_rect().
//
// Returns 0 without writing any output when bsize is not the sequence's
// superblock size, or when the plane's frame_restoration_type is
// RESTORE_NONE. Otherwise all four outputs are written and the return value
// is nonzero iff both ranges are non-empty.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params.sb_size ||
      cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
    return 0;
  }

  assert(!cm->all_lossless);

  const int is_chroma = plane > 0;

  // The whole frame is treated as one tile, so mi_row / mi_col are already
  // relative to the tile's top-left corner.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_chroma);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  // MI-unit coordinates one past the superblock's right and bottom edges.
  const int mi_row_end = mi_row + mi_size_high[bsize];
  const int mi_col_end = mi_col + mi_size_wide[bsize];

  const int unit_size = cm->rst_info[plane].restoration_unit_size;

  // Number of restoration units actually present across and down the frame;
  // used as the upper clamp for the end indices.
  const int max_cols = av1_lr_count_units_in_tile(unit_size, frame_w);
  const int max_rows = av1_lr_count_units_in_tile(unit_size, frame_h);

  // Size in pixels of one MI unit on this plane.
  const int ss_x = is_chroma && cm->seq_params.subsampling_x;
  const int ss_y = is_chroma && cm->seq_params.subsampling_y;
  const int mi_px_x = MI_SIZE >> ss_x;
  const int mi_px_y = MI_SIZE >> ss_y;

  // A unit index is (mi * num) / den. Without superres that is simply
  // mi * mi_px / unit_size. With superres, restoration units live in the
  // upscaled domain: for a relative mi position m, superres denominator D
  // and numerator N, the upscaled offset u satisfies MI_SIZE * m = (N / D) * u,
  // i.e. u = D * MI_SIZE * m / N. Only the horizontal axis is scaled.
  int num_x = mi_px_x;
  int den_x = unit_size;
  if (av1_superres_scaled(cm)) {
    num_x *= cm->superres_scale_denominator;
    den_x *= SCALE_NUMERATOR;
  }
  const int num_y = mi_px_y;
  const int den_y = unit_size;

  // Adding (den - 1) before dividing rounds the quotient up.
  const int round_x = den_x - 1;
  const int round_y = den_y - 1;

  // First unit column/row that does not start to the left of / above the
  // superblock's top-left corner. Rounding up matters: a superblock that
  // begins at unit column 10.1 first contains the start of unit 11.
  *rcol0 = (mi_col * num_x + round_x) / den_x;
  *rrow0 = (mi_row * num_y + round_y) / den_y;

  // Same computation for the superblock's far edges, clamped because the
  // unit it points at may not exist at the right/bottom of the frame.
  *rcol1 = AOMMIN((mi_col_end * num_x + round_x) / den_x, max_cols);
  *rrow1 = AOMMIN((mi_row_end * num_y + round_y) / den_y, max_rows);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1369 
// Pad `height` rows horizontally by replicating edge pixels: the `extend`
// pixels before each row are filled with the row's first pixel and the
// `extend` pixels after it with the row's last pixel (index width - 1).
//
// buf points at the first pixel of the first row. With use_highbitdepth set,
// each row is accessed as uint16_t samples. `stride` is added to the byte
// pointer in both modes, so it is a distance in bytes.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  if (use_highbitdepth) {
    for (int row = 0; row < height; ++row, buf += stride) {
      uint16_t *const px = (uint16_t *)buf;
      aom_memset16(px - extend, px[0], extend);
      aom_memset16(px + width, px[width - 1], extend);
    }
    return;
  }

  for (int row = 0; row < height; ++row, buf += stride) {
    memset(buf - extend, buf[0], extend);
    memset(buf + width, buf[width - 1], extend);
  }
}
1385 
// Copy up to RESTORATION_CTX_VERT rows of `frame`, starting at `row`, into
// the boundary buffer slot for `stripe` (the "above" buffer when is_above is
// nonzero, otherwise the "below" buffer), then pad the saved rows left and
// right by RESTORATION_EXTRA_HORZ pixels.
//
// When av1_superres_scaled(cm) is true the rows are produced by
// av1_upscale_normative_rows(); otherwise they are copied verbatim.
static void save_deblock_boundary_lines(
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
    int stripe, int use_highbd, int is_above,
    RestorationStripeBoundaries *boundaries) {
  const int is_chroma = plane > 0;

  // Source rows. Strides with a _px suffix are in pixels; the others are in
  // bytes (doubled for high bit depth).
  const int src_stride_px = frame->strides[is_chroma];
  const int src_stride = src_stride_px << use_highbd;
  const uint8_t *src_base = REAL_PTR(use_highbd, frame->buffers[plane]);
  const uint8_t *src_rows = src_base + row * src_stride;

  // Destination rows: skip the left padding, then index this stripe's slot.
  uint8_t *bdry_buf;
  if (is_above) {
    bdry_buf = boundaries->stripe_boundary_above;
  } else {
    bdry_buf = boundaries->stripe_boundary_below;
  }
  const int bdry_stride_px = boundaries->stripe_boundary_stride;
  const int bdry_stride = bdry_stride_px << use_highbd;
  uint8_t *bdry_rows = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd) +
                       RESTORATION_CTX_VERT * stripe * bdry_stride;

  // Rarely, a processing stripe ends 1px above the crop border. Deblocked
  // pixels from below the stripe are still wanted, but only one such row
  // exists, so it is fetched once and duplicated below. This is equivalent
  // to clamping the sample locations against the crop border.
  const int lines_to_save =
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_chroma] - row);
  assert(lines_to_save == 1 || lines_to_save == 2);

  int upscaled_width;
  if (av1_superres_scaled(cm)) {
    const int ss_x = is_chroma && cm->seq_params.subsampling_x;
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;

    // The upscaler takes pixel strides; for high bit depth the pointers are
    // passed through CONVERT_TO_BYTEPTR().
    const uint8_t *up_src =
        use_highbd ? CONVERT_TO_BYTEPTR(src_rows) : src_rows;
    uint8_t *up_dst = use_highbd ? CONVERT_TO_BYTEPTR(bdry_rows) : bdry_rows;
    av1_upscale_normative_rows(cm, up_src, src_stride_px, up_dst,
                               bdry_stride_px, plane, lines_to_save);
  } else {
    upscaled_width = frame->crop_widths[is_chroma];
    const int copy_bytes = upscaled_width << use_highbd;
    for (int line = 0; line < lines_to_save; ++line) {
      memcpy(bdry_rows + line * bdry_stride, src_rows + line * src_stride,
             copy_bytes);
    }
  }
  const int line_bytes = upscaled_width << use_highbd;

  // Only one row was available: replicate it into the second context row.
  if (lines_to_save == 1) {
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1440 
// Fill the boundary buffer slot for `stripe` (the "above" buffer when
// is_above is nonzero, otherwise the "below" buffer) with
// RESTORATION_CTX_VERT copies of the single frame row `row`, then pad the
// saved rows left and right by RESTORATION_EXTRA_HORZ pixels.
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                     const AV1_COMMON *cm, int plane, int row,
                                     int stripe, int use_highbd, int is_above,
                                     RestorationStripeBoundaries *boundaries) {
  const int is_chroma = plane > 0;

  // Source row, addressed in bytes (stride doubled for high bit depth).
  const int src_stride = frame->strides[is_chroma] << use_highbd;
  const uint8_t *src_base = REAL_PTR(use_highbd, frame->buffers[plane]);
  const uint8_t *src_row = src_base + row * src_stride;

  // Destination rows: skip the left padding, then index this stripe's slot.
  uint8_t *bdry_buf;
  if (is_above) {
    bdry_buf = boundaries->stripe_boundary_above;
  } else {
    bdry_buf = boundaries->stripe_boundary_below;
  }
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *bdry_rows = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd) +
                       RESTORATION_CTX_VERT * stripe * bdry_stride;

  // Per the original note here, superres has already been applied when this
  // runs, so rows are copied straight from the frame with no upscaling step;
  // only the row width depends on whether superres is in use.
  int upscaled_width = frame->crop_widths[is_chroma];
  if (av1_superres_scaled(cm)) {
    const int ss_x = is_chroma && cm->seq_params.subsampling_x;
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
  }
  const int line_bytes = upscaled_width << use_highbd;

  // Every context row receives the same source row: the outermost row of
  // CDEF output is (effectively) extended to form the border, instead of
  // using deblocked pixels from the tile above/below.
  for (int line = 0; line < RESTORATION_CTX_VERT; ++line) {
    memcpy(bdry_rows + line * bdry_stride, src_row, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1475 
// Save the boundary context rows for every processing stripe of one plane.
//
// Stripes are (RESTORATION_PROC_UNIT_SIZE >> ss_y) rows tall and shifted up
// by (RESTORATION_UNIT_OFFSET >> ss_y); the first stripe is clamped to start
// at the top of the frame rectangle and the last to end at its bottom.
//
// Each stripe edge is saved in exactly one of the two passes:
//   after_cdef == 0: edges interior to the frame, from deblocked pixels
//                    (save_deblock_boundary_lines).
//   after_cdef != 0: the remaining edges, i.e. the top of the first stripe
//                    and any bottom edge at or beyond the plane height, from
//                    the stripe's own outermost row
//                    (save_cdef_boundary_lines).
static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                         int use_highbd, int plane,
                                         AV1_COMMON *cm, int after_cdef) {
  const int is_chroma = plane > 0;
  const int ss_y = is_chroma && cm->seq_params.subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;

  // The whole frame is handled as a single tile.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_chroma);

  RestorationStripeBoundaries *const boundaries =
      &cm->rst_info[plane].boundaries;

  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  for (int stripe = 0;; ++stripe) {
    // First row of this stripe; stop once it falls outside the frame.
    const int rel_top = AOMMAX(0, stripe * stripe_height - stripe_off);
    const int y0 = frame_rect.top + rel_top;
    if (y0 >= frame_rect.bottom) break;

    // One past the last row of this stripe, clamped to the frame.
    const int rel_bottom = (stripe + 1) * stripe_height - stripe_off;
    const int y1 = AOMMIN(frame_rect.top + rel_bottom, frame_rect.bottom);

    // Only the outer top and bottom edges of the frame take CDEF pixels;
    // every interior stripe edge uses deblocked pixels from its neighbour.
    const int deblock_above = stripe > 0;
    const int deblock_below = y1 < plane_height;

    if (after_cdef) {
      // CDEF context is saved for exactly those edges that did not get
      // deblocked context, using the outermost row inside the stripe rather
      // than data from outside it.
      if (!deblock_above) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe, use_highbd, 1,
                                 boundaries);
      }
      if (!deblock_below) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe, use_highbd,
                                 0, boundaries);
      }
    } else {
      // Deblocked context: the rows just above y0 and the rows from y1 down.
      if (deblock_above) {
        save_deblock_boundary_lines(frame, cm, plane,
                                    y0 - RESTORATION_CTX_VERT, stripe,
                                    use_highbd, 1, boundaries);
      }
      if (deblock_below) {
        save_deblock_boundary_lines(frame, cm, plane, y1, stripe, use_highbd,
                                    0, boundaries);
      }
    }
  }
}
1538 
1539 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1540 // lines to be used as boundary in the loop restoration process. The
1541 // lines are saved in rst_internal.stripe_boundary_lines
av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG * frame,AV1_COMMON * cm,int after_cdef)1542 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1543                                               AV1_COMMON *cm, int after_cdef) {
1544   const int num_planes = av1_num_planes(cm);
1545   const int use_highbd = cm->seq_params.use_highbitdepth;
1546   for (int p = 0; p < num_planes; ++p) {
1547     save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1548   }
1549 }
1550