1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/loop_restoration.h"
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22 
23 #include "src/dsp/common.h"
24 #include "src/dsp/dsp.h"
25 #include "src/utils/common.h"
26 #include "src/utils/constants.h"
27 
28 namespace libgav1 {
29 namespace dsp {
30 
// Section 7.17.3.
// Lookup table for the self-guided filter weight |ma|. It has 256 entries and
// is indexed by z after z has been clamped to at most 255.
// a2: range [1, 256].
// if (z >= 255)
//   a2 = 256;
// else if (z == 0)
//   a2 = 1;
// else
//   a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
// ma = 256 - a2;
// Consequently every entry is in [0, 255]: entry 0 is 255 and entry 255 is 0.
// NOTE(review): the 16-byte alignment presumably serves vector loads in SIMD
// code elsewhere -- confirm against the other users of this table.
alignas(16) const uint8_t kSgrMaLookup[256] = {
    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
    13,  13,  12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,  8,  8,  7,  7,
    7,   7,   7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,
    5,   5,   4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,   3,   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    3,   3,   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,
    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    2,   2,   2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  0};
56 
57 namespace {
58 
59 template <int bitdepth, typename Pixel>
WienerHorizontal(const Pixel * source,const ptrdiff_t source_stride,const int width,const int height,const int16_t * const filter,const int number_zero_coefficients,int16_t ** wiener_buffer)60 inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
61                              const int width, const int height,
62                              const int16_t* const filter,
63                              const int number_zero_coefficients,
64                              int16_t** wiener_buffer) {
65   constexpr int kCenterTap = kWienerFilterTaps / 2;
66   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
67                                            ? kInterRoundBitsHorizontal12bpp
68                                            : kInterRoundBitsHorizontal;
69   constexpr int offset =
70       1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
71   constexpr int limit = (offset << 2) - 1;
72   for (int y = 0; y < height; ++y) {
73     int x = 0;
74     do {
75       // sum fits into 16 bits only when bitdepth = 8.
76       int sum = 0;
77       for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
78         sum +=
79             filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]);
80       }
81       sum += filter[kCenterTap] * source[x + kCenterTap];
82       const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
83       (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset);
84     } while (++x != width);
85     source += source_stride;
86     *wiener_buffer += width;
87   }
88 }
89 
90 template <int bitdepth, typename Pixel>
WienerVertical(const int16_t * wiener_buffer,const int width,const int height,const int16_t * const filter,const int number_zero_coefficients,void * const dest,const ptrdiff_t dest_stride)91 inline void WienerVertical(const int16_t* wiener_buffer, const int width,
92                            const int height, const int16_t* const filter,
93                            const int number_zero_coefficients, void* const dest,
94                            const ptrdiff_t dest_stride) {
95   constexpr int kCenterTap = kWienerFilterTaps / 2;
96   constexpr int kRoundBitsVertical =
97       (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
98   auto* dst = static_cast<Pixel*>(dest);
99   int y = height;
100   do {
101     int x = 0;
102     do {
103       // sum needs 32 bits.
104       int sum = 0;
105       for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
106         sum += filter[k] *
107                (wiener_buffer[k * width + x] +
108                 wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]);
109       }
110       sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
111       const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
112       dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
113     } while (++x != width);
114     wiener_buffer += width;
115     dst += dest_stride;
116   } while (--y != 0);
117 }
118 
119 // Note: bit range for wiener filter.
120 // Wiener filter process first applies horizontal filtering to input pixels,
121 // followed by rounding with predefined bits (dependent on bitdepth).
122 // Then vertical filtering is applied, followed by rounding (dependent on
123 // bitdepth).
124 // The process is the same as convolution:
125 // <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
126 // --> <rounding 1>
127 // By design:
128 // (a). horizontal/vertical filtering adds 7 bits to input.
129 // (b). The output of first rounding fits into 16 bits.
130 // (c). The output of second rounding fits into 16 bits.
// If input bitdepth > 8, the accumulator of the horizontal filter is larger
// than 16 bits and smaller than 32 bits.
133 // The accumulator of the vertical filter is larger than 16 bits and smaller
134 // than 32 bits.
135 // Note: range of wiener filter coefficients.
136 // Wiener filter coefficients are symmetric, and their sum is 1 (128).
137 // The range of each coefficient:
138 // filter[0] = filter[6], 4 bits, min = -5, max = 10.
139 // filter[1] = filter[5], 5 bits, min = -23, max = 8.
140 // filter[2] = filter[4], 6 bits, min = -17, max = 46.
141 // filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]).
142 // The difference from libaom is that in libaom:
143 // filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
144 // Thus in libaom's computation, an offset of 128 is needed for filter[3].
145 template <int bitdepth, typename Pixel>
WienerFilter_C(const RestorationUnitInfo & restoration_info,const void * const source,const ptrdiff_t stride,const void * const top_border,const ptrdiff_t top_border_stride,const void * const bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,RestorationBuffer * const restoration_buffer,void * const dest)146 void WienerFilter_C(
147     const RestorationUnitInfo& restoration_info, const void* const source,
148     const ptrdiff_t stride, const void* const top_border,
149     const ptrdiff_t top_border_stride, const void* const bottom_border,
150     const ptrdiff_t bottom_border_stride, const int width, const int height,
151     RestorationBuffer* const restoration_buffer, void* const dest) {
152   constexpr int kCenterTap = kWienerFilterTaps / 2;
153   const int16_t* const number_leading_zero_coefficients =
154       restoration_info.wiener_info.number_leading_zero_coefficients;
155   const int number_rows_to_skip = std::max(
156       static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
157       1);
158   int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer;
159 
160   // horizontal filtering.
161   const int height_horizontal =
162       height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
163   const int height_extra = (height_horizontal - height) >> 1;
164   assert(height_extra <= 2);
165   const int16_t* const filter_horizontal =
166       restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
167   const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
168   const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
169   const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
170   auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
171 
172   if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
173     WienerHorizontal<bitdepth, Pixel>(
174         top + (2 - height_extra) * top_border_stride, top_border_stride, width,
175         height_extra, filter_horizontal, 0, &wiener_buffer);
176     WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
177                                       filter_horizontal, 0, &wiener_buffer);
178     WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
179                                       height_extra, filter_horizontal, 0,
180                                       &wiener_buffer);
181   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
182     WienerHorizontal<bitdepth, Pixel>(
183         top + (2 - height_extra) * top_border_stride, top_border_stride, width,
184         height_extra, filter_horizontal, 1, &wiener_buffer);
185     WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
186                                       filter_horizontal, 1, &wiener_buffer);
187     WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
188                                       height_extra, filter_horizontal, 1,
189                                       &wiener_buffer);
190   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
191     WienerHorizontal<bitdepth, Pixel>(
192         top + (2 - height_extra) * top_border_stride, top_border_stride, width,
193         height_extra, filter_horizontal, 2, &wiener_buffer);
194     WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
195                                       filter_horizontal, 2, &wiener_buffer);
196     WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
197                                       height_extra, filter_horizontal, 2,
198                                       &wiener_buffer);
199   } else {
200     assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
201     WienerHorizontal<bitdepth, Pixel>(
202         top + (2 - height_extra) * top_border_stride, top_border_stride, width,
203         height_extra, filter_horizontal, 3, &wiener_buffer);
204     WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
205                                       filter_horizontal, 3, &wiener_buffer);
206     WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
207                                       height_extra, filter_horizontal, 3,
208                                       &wiener_buffer);
209   }
210 
211   // vertical filtering.
212   const int16_t* const filter_vertical =
213       restoration_info.wiener_info.filter[WienerInfo::kVertical];
214   if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
215     // Because the top row of |source| is a duplicate of the second row, and the
216     // bottom row of |source| is a duplicate of its above row, we can duplicate
217     // the top and bottom row of |wiener_buffer| accordingly.
218     memcpy(wiener_buffer, wiener_buffer - width,
219            sizeof(*wiener_buffer) * width);
220     memcpy(wiener_buffer_org, wiener_buffer_org + width,
221            sizeof(*wiener_buffer) * width);
222     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
223                                     filter_vertical, 0, dest, stride);
224   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
225     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
226                                     filter_vertical, 1, dest, stride);
227   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
228     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
229                                     filter_vertical, 2, dest, stride);
230   } else {
231     assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
232     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
233                                     filter_vertical, 3, dest, stride);
234   }
235 }
236 
237 //------------------------------------------------------------------------------
238 // SGR
239 
240 // When |height| is 1, |src_stride| could be set to an arbitrary value.
241 template <typename Pixel, int size>
BoxSum(const Pixel * src,const ptrdiff_t src_stride,const int height,const int width,uint16_t * const * sums,uint32_t * const * square_sums)242 LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
243                                   const int height, const int width,
244                                   uint16_t* const* sums,
245                                   uint32_t* const* square_sums) {
246   int y = height;
247   do {
248     uint32_t sum = 0;
249     uint32_t square_sum = 0;
250     for (int dx = 0; dx < size; ++dx) {
251       const Pixel source = src[dx];
252       sum += source;
253       square_sum += source * source;
254     }
255     (*sums)[0] = sum;
256     (*square_sums)[0] = square_sum;
257     int x = 1;
258     do {
259       const Pixel source0 = src[x - 1];
260       const Pixel source1 = src[x - 1 + size];
261       sum -= source0;
262       sum += source1;
263       square_sum -= source0 * source0;
264       square_sum += source1 * source1;
265       (*sums)[x] = sum;
266       (*square_sums)[x] = square_sum;
267     } while (++x != width);
268     src += src_stride;
269     ++sums;
270     ++square_sums;
271   } while (--y != 0);
272 }
273 
274 // When |height| is 1, |src_stride| could be set to an arbitrary value.
275 template <typename Pixel>
BoxSum(const Pixel * src,const ptrdiff_t src_stride,const int height,const int width,uint16_t * const * sum3,uint16_t * const * sum5,uint32_t * const * square_sum3,uint32_t * const * square_sum5)276 LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
277                                   const int height, const int width,
278                                   uint16_t* const* sum3, uint16_t* const* sum5,
279                                   uint32_t* const* square_sum3,
280                                   uint32_t* const* square_sum5) {
281   int y = height;
282   do {
283     uint32_t sum = 0;
284     uint32_t square_sum = 0;
285     for (int dx = 0; dx < 4; ++dx) {
286       const Pixel source = src[dx];
287       sum += source;
288       square_sum += source * source;
289     }
290     int x = 0;
291     do {
292       const Pixel source0 = src[x];
293       const Pixel source1 = src[x + 4];
294       sum -= source0;
295       square_sum -= source0 * source0;
296       (*sum3)[x] = sum;
297       (*square_sum3)[x] = square_sum;
298       sum += source1;
299       square_sum += source1 * source1;
300       (*sum5)[x] = sum + source0;
301       (*square_sum5)[x] = square_sum + source0 * source0;
302     } while (++x != width);
303     src += src_stride;
304     ++sum3;
305     ++sum5;
306     ++square_sum3;
307     ++square_sum5;
308   } while (--y != 0);
309 }
310 
311 template <int bitdepth, int n>
CalculateIntermediate(const uint32_t s,uint32_t a,const uint32_t b,uint8_t * const ma_ptr,uint32_t * const b_ptr)312 inline void CalculateIntermediate(const uint32_t s, uint32_t a,
313                                   const uint32_t b, uint8_t* const ma_ptr,
314                                   uint32_t* const b_ptr) {
315   // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
316   // since max bitdepth = 12, max < 2^31.
317   // after shift, a < 2^16 * n < 2^22 regardless of bitdepth
318   a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
319   // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
320   // d < 2^8 * n < 2^14 regardless of bitdepth
321   const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
322   // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
323   // and p itself satisfies p < 2^14 * n^2 < 2^26.
324   // This bound on p is due to:
325   // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
326   // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
327   // This is an artifact of rounding, and can only happen if all pixels
328   // are (almost) identical, so in this case we saturate to p=0.
329   const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
330   // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
331   // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
332   // (this holds even after accounting for the rounding in s)
333   const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
334   // ma: range [0, 255].
335   const uint32_t ma = kSgrMaLookup[std::min(z, 255u)];
336   const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
337   // ma < 2^8, b < 2^(bitdepth) * n,
338   // one_over_n = round(2^12 / n)
339   // => the product here is < 2^(20 + bitdepth) <= 2^32,
340   // and b is set to a value < 2^(8 + bitdepth).
341   // This holds even with the rounding in one_over_n and in the overall result,
342   // as long as ma is strictly less than 2^8.
343   const uint32_t b2 = ma * b * one_over_n;
344   *ma_ptr = ma;
345   *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
346 }
347 
// Returns the weighted sum of src[0..2] with weights (3, 4, 3).
template <typename T>
inline uint32_t Sum343(const T* const src) {
  const auto outer = src[0] + src[2];
  const T center = src[1];
  return 3 * outer + 4 * center;
}
352 
// Returns the weighted sum of src[0..2] with weights (4, 4, 4).
template <typename T>
inline uint32_t Sum444(const T* const src) {
  const auto total = src[0] + src[1] + src[2];
  return 4 * total;
}
357 
// Returns the weighted sum of src[0..2] with weights (5, 6, 5).
template <typename T>
inline uint32_t Sum565(const T* const src) {
  const auto outer = src[0] + src[2];
  const T center = src[1];
  return 5 * outer + 6 * center;
}
362 
363 template <int bitdepth>
BoxFilterPreProcess5(const uint16_t * const sum5[5],const uint32_t * const square_sum5[5],const int width,const uint32_t s,SgrBuffer * const sgr_buffer,uint16_t * const ma565,uint32_t * const b565)364 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
365     const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
366     const int width, const uint32_t s, SgrBuffer* const sgr_buffer,
367     uint16_t* const ma565, uint32_t* const b565) {
368   int x = 0;
369   do {
370     uint32_t a = 0;
371     uint32_t b = 0;
372     for (int dy = 0; dy < 5; ++dy) {
373       a += square_sum5[dy][x];
374       b += sum5[dy][x];
375     }
376     CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x,
377                                         sgr_buffer->b + x);
378   } while (++x != width + 2);
379   x = 0;
380   do {
381     ma565[x] = Sum565(sgr_buffer->ma + x);
382     b565[x] = Sum565(sgr_buffer->b + x);
383   } while (++x != width);
384 }
385 
386 template <int bitdepth>
BoxFilterPreProcess3(const uint16_t * const sum3[3],const uint32_t * const square_sum3[3],const int width,const uint32_t s,const bool calculate444,SgrBuffer * const sgr_buffer,uint16_t * const ma343,uint32_t * const b343,uint16_t * const ma444,uint32_t * const b444)387 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
388     const uint16_t* const sum3[3], const uint32_t* const square_sum3[3],
389     const int width, const uint32_t s, const bool calculate444,
390     SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343,
391     uint16_t* const ma444, uint32_t* const b444) {
392   int x = 0;
393   do {
394     uint32_t a = 0;
395     uint32_t b = 0;
396     for (int dy = 0; dy < 3; ++dy) {
397       a += square_sum3[dy][x];
398       b += sum3[dy][x];
399     }
400     CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x,
401                                        sgr_buffer->b + x);
402   } while (++x != width + 2);
403   x = 0;
404   do {
405     ma343[x] = Sum343(sgr_buffer->ma + x);
406     b343[x] = Sum343(sgr_buffer->b + x);
407   } while (++x != width);
408   if (calculate444) {
409     x = 0;
410     do {
411       ma444[x] = Sum444(sgr_buffer->ma + x);
412       b444[x] = Sum444(sgr_buffer->b + x);
413     } while (++x != width);
414   }
415 }
416 
417 template <typename Pixel>
CalculateFilteredOutput(const Pixel src,const uint32_t ma,const uint32_t b,const int shift)418 inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma,
419                                    const uint32_t b, const int shift) {
420   const int32_t v = b - ma * src;
421   return RightShiftWithRounding(v,
422                                 kSgrProjSgrBits + shift - kSgrProjRestoreBits);
423 }
424 
425 template <typename Pixel>
BoxFilterPass1Kernel(const Pixel src0,const Pixel src1,const uint16_t * const ma565[2],const uint32_t * const b565[2],const ptrdiff_t x,int p[2])426 inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
427                                  const uint16_t* const ma565[2],
428                                  const uint32_t* const b565[2],
429                                  const ptrdiff_t x, int p[2]) {
430   p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x],
431                                         b565[0][x] + b565[1][x], 5);
432   p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4);
433 }
434 
435 template <typename Pixel>
BoxFilterPass2Kernel(const Pixel src,const uint16_t * const ma343[3],const uint16_t * const ma444,const uint32_t * const b343[3],const uint32_t * const b444,const ptrdiff_t x)436 inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
437                                 const uint16_t* const ma444,
438                                 const uint32_t* const b343[3],
439                                 const uint32_t* const b444, const ptrdiff_t x) {
440   const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
441   const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
442   return CalculateFilteredOutput<Pixel>(src, ma, b, 5);
443 }
444 
445 template <int bitdepth, typename Pixel>
SelfGuidedFinal(const int src,const int v)446 inline Pixel SelfGuidedFinal(const int src, const int v) {
447   // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
448   // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
449   // Then, range of s is bitdepth + 2. This is a rough estimation, taking the
450   // maximum value of each element.
451   const int s = src + RightShiftWithRounding(
452                           v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
453   return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
454 }
455 
456 template <int bitdepth, typename Pixel>
SelfGuidedDoubleMultiplier(const int src,const int filter0,const int filter1,const int16_t w0,const int16_t w2)457 inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0,
458                                         const int filter1, const int16_t w0,
459                                         const int16_t w2) {
460   const int v = w0 * filter0 + w2 * filter1;
461   return SelfGuidedFinal<bitdepth, Pixel>(src, v);
462 }
463 
464 template <int bitdepth, typename Pixel>
SelfGuidedSingleMultiplier(const int src,const int filter,const int16_t w0)465 inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter,
466                                         const int16_t w0) {
467   const int v = w0 * filter;
468   return SelfGuidedFinal<bitdepth, Pixel>(src, v);
469 }
470 
471 template <int bitdepth, typename Pixel>
BoxFilterPass1(const Pixel * const src,const ptrdiff_t stride,uint16_t * const sum5[5],uint32_t * const square_sum5[5],const int width,const uint32_t scale,const int16_t w0,SgrBuffer * const sgr_buffer,uint16_t * const ma565[2],uint32_t * const b565[2],Pixel * dst)472 inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
473                            uint16_t* const sum5[5],
474                            uint32_t* const square_sum5[5], const int width,
475                            const uint32_t scale, const int16_t w0,
476                            SgrBuffer* const sgr_buffer,
477                            uint16_t* const ma565[2], uint32_t* const b565[2],
478                            Pixel* dst) {
479   BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
480                                  ma565[1], b565[1]);
481   int x = 0;
482   do {
483     int p[2];
484     BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p);
485     dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
486     dst[stride + x] =
487         SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
488   } while (++x != width);
489 }
490 
491 template <int bitdepth, typename Pixel>
BoxFilterPass2(const Pixel * const src,const Pixel * const src0,const int width,const uint16_t scale,const int16_t w0,uint16_t * const sum3[4],uint32_t * const square_sum3[4],SgrBuffer * const sgr_buffer,uint16_t * const ma343[4],uint16_t * const ma444[3],uint32_t * const b343[4],uint32_t * const b444[3],Pixel * dst)492 inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
493                            const int width, const uint16_t scale,
494                            const int16_t w0, uint16_t* const sum3[4],
495                            uint32_t* const square_sum3[4],
496                            SgrBuffer* const sgr_buffer,
497                            uint16_t* const ma343[4], uint16_t* const ma444[3],
498                            uint32_t* const b343[4], uint32_t* const b444[3],
499                            Pixel* dst) {
500   BoxSum<Pixel, 3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
501   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
502                                  sgr_buffer, ma343[2], b343[2], ma444[1],
503                                  b444[1]);
504   int x = 0;
505   do {
506     const int p =
507         BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
508     dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
509   } while (++x != width);
510 }
511 
512 template <int bitdepth, typename Pixel>
BoxFilter(const Pixel * const src,const ptrdiff_t stride,uint16_t * const sum3[4],uint16_t * const sum5[5],uint32_t * const square_sum3[4],uint32_t * const square_sum5[5],const int width,const uint16_t scales[2],const int16_t w0,const int16_t w2,SgrBuffer * const sgr_buffer,uint16_t * const ma343[4],uint16_t * const ma444[3],uint16_t * const ma565[2],uint32_t * const b343[4],uint32_t * const b444[3],uint32_t * const b565[2],Pixel * dst)513 inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
514                       uint16_t* const sum3[4], uint16_t* const sum5[5],
515                       uint32_t* const square_sum3[4],
516                       uint32_t* const square_sum5[5], const int width,
517                       const uint16_t scales[2], const int16_t w0,
518                       const int16_t w2, SgrBuffer* const sgr_buffer,
519                       uint16_t* const ma343[4], uint16_t* const ma444[3],
520                       uint16_t* const ma565[2], uint32_t* const b343[4],
521                       uint32_t* const b444[3], uint32_t* const b565[2],
522                       Pixel* dst) {
523   BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
524                                  sgr_buffer, ma565[1], b565[1]);
525   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
526                                  sgr_buffer, ma343[2], b343[2], ma444[1],
527                                  b444[1]);
528   BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
529                                  true, sgr_buffer, ma343[3], b343[3], ma444[2],
530                                  b444[2]);
531   int x = 0;
532   do {
533     int p[2][2];
534     BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p[0]);
535     p[1][0] =
536         BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
537     p[1][1] = BoxFilterPass2Kernel<Pixel>(src[stride + x], ma343 + 1, ma444[1],
538                                           b343 + 1, b444[1], x);
539     dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
540                                                          p[1][0], w0, w2);
541     dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
542         src[stride + x], p[0][1], p[1][1], w0, w2);
543   } while (++x != width);
544 }
545 
546 template <int bitdepth, typename Pixel>
BoxFilterProcess(const RestorationUnitInfo & restoration_info,const Pixel * src,const ptrdiff_t stride,const Pixel * const top_border,const ptrdiff_t top_border_stride,const Pixel * bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,SgrBuffer * const sgr_buffer,Pixel * dst)547 inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
548                              const Pixel* src, const ptrdiff_t stride,
549                              const Pixel* const top_border,
550                              const ptrdiff_t top_border_stride,
551                              const Pixel* bottom_border,
552                              const ptrdiff_t bottom_border_stride,
553                              const int width, const int height,
554                              SgrBuffer* const sgr_buffer, Pixel* dst) {
555   const auto temp_stride = Align<ptrdiff_t>(width, 8);
556   const ptrdiff_t sum_stride = temp_stride + 8;
557   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
558   const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
559   const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
560   const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
561   const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
562   uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
563   uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
564   sum3[0] = sgr_buffer->sum3;
565   square_sum3[0] = sgr_buffer->square_sum3;
566   ma343[0] = sgr_buffer->ma343;
567   b343[0] = sgr_buffer->b343;
568   for (int i = 1; i <= 3; ++i) {
569     sum3[i] = sum3[i - 1] + sum_stride;
570     square_sum3[i] = square_sum3[i - 1] + sum_stride;
571     ma343[i] = ma343[i - 1] + temp_stride;
572     b343[i] = b343[i - 1] + temp_stride;
573   }
574   sum5[0] = sgr_buffer->sum5;
575   square_sum5[0] = sgr_buffer->square_sum5;
576   for (int i = 1; i <= 4; ++i) {
577     sum5[i] = sum5[i - 1] + sum_stride;
578     square_sum5[i] = square_sum5[i - 1] + sum_stride;
579   }
580   ma444[0] = sgr_buffer->ma444;
581   b444[0] = sgr_buffer->b444;
582   for (int i = 1; i <= 2; ++i) {
583     ma444[i] = ma444[i - 1] + temp_stride;
584     b444[i] = b444[i - 1] + temp_stride;
585   }
586   ma565[0] = sgr_buffer->ma565;
587   ma565[1] = ma565[0] + temp_stride;
588   b565[0] = sgr_buffer->b565;
589   b565[1] = b565[0] + temp_stride;
590   assert(scales[0] != 0);
591   assert(scales[1] != 0);
592   BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 1,
593                 square_sum3, square_sum5 + 1);
594   sum5[0] = sum5[1];
595   square_sum5[0] = square_sum5[1];
596   BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
597                 square_sum5 + 3);
598   const Pixel* const s = (height > 1) ? src + stride : bottom_border;
599   BoxSum<Pixel>(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
600                 square_sum5 + 4);
601   BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
602                                  sgr_buffer, ma565[0], b565[0]);
603   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
604                                  sgr_buffer, ma343[0], b343[0], nullptr,
605                                  nullptr);
606   BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
607                                  true, sgr_buffer, ma343[1], b343[1], ma444[0],
608                                  b444[0]);
609   sum5[0] = sgr_buffer->sum5;
610   square_sum5[0] = sgr_buffer->square_sum5;
611 
612   for (int y = (height >> 1) - 1; y > 0; --y) {
613     Circulate4PointersBy2<uint16_t>(sum3);
614     Circulate4PointersBy2<uint32_t>(square_sum3);
615     Circulate5PointersBy2<uint16_t>(sum5);
616     Circulate5PointersBy2<uint32_t>(square_sum5);
617     BoxSum<Pixel>(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
618                   square_sum3 + 2, square_sum5 + 3);
619     BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
620                                square_sum5, width, scales, w0, w2, sgr_buffer,
621                                ma343, ma444, ma565, b343, b444, b565, dst);
622     src += 2 * stride;
623     dst += 2 * stride;
624     Circulate4PointersBy2<uint16_t>(ma343);
625     Circulate4PointersBy2<uint32_t>(b343);
626     std::swap(ma444[0], ma444[2]);
627     std::swap(b444[0], b444[2]);
628     std::swap(ma565[0], ma565[1]);
629     std::swap(b565[0], b565[1]);
630   }
631 
632   Circulate4PointersBy2<uint16_t>(sum3);
633   Circulate4PointersBy2<uint32_t>(square_sum3);
634   Circulate5PointersBy2<uint16_t>(sum5);
635   Circulate5PointersBy2<uint32_t>(square_sum5);
636   if ((height & 1) == 0 || height > 1) {
637     const Pixel* sr;
638     ptrdiff_t s_stride;
639     if ((height & 1) == 0) {
640       sr = bottom_border;
641       s_stride = bottom_border_stride;
642     } else {
643       sr = src + 2 * stride;
644       s_stride = bottom_border - (src + 2 * stride);
645     }
646     BoxSum<Pixel>(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
647                   square_sum3 + 2, square_sum5 + 3);
648     BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
649                                square_sum5, width, scales, w0, w2, sgr_buffer,
650                                ma343, ma444, ma565, b343, b444, b565, dst);
651   }
652   if ((height & 1) != 0) {
653     src += 3;
654     if (height > 1) {
655       src += 2 * stride;
656       dst += 2 * stride;
657       Circulate4PointersBy2<uint16_t>(sum3);
658       Circulate4PointersBy2<uint32_t>(square_sum3);
659       Circulate5PointersBy2<uint16_t>(sum5);
660       Circulate5PointersBy2<uint32_t>(square_sum5);
661       Circulate4PointersBy2<uint16_t>(ma343);
662       Circulate4PointersBy2<uint32_t>(b343);
663       std::swap(ma444[0], ma444[2]);
664       std::swap(b444[0], b444[2]);
665       std::swap(ma565[0], ma565[1]);
666       std::swap(b565[0], b565[1]);
667     }
668     BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1,
669                   width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
670                   square_sum5 + 3);
671     sum5[4] = sum5[3];
672     square_sum5[4] = square_sum5[3];
673     BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
674                                    sgr_buffer, ma565[1], b565[1]);
675     BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
676                                    sgr_buffer, ma343[2], b343[2], nullptr,
677                                    nullptr);
678     int x = 0;
679     do {
680       const int p0 = CalculateFilteredOutput<Pixel>(
681           src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
682       const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
683                                                  b444[0], x);
684       dst[x] =
685           SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
686     } while (++x != width);
687   }
688 }
689 
690 template <int bitdepth, typename Pixel>
BoxFilterProcessPass1(const RestorationUnitInfo & restoration_info,const Pixel * src,const ptrdiff_t stride,const Pixel * const top_border,const ptrdiff_t top_border_stride,const Pixel * bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,SgrBuffer * const sgr_buffer,Pixel * dst)691 inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
692                                   const Pixel* src, const ptrdiff_t stride,
693                                   const Pixel* const top_border,
694                                   const ptrdiff_t top_border_stride,
695                                   const Pixel* bottom_border,
696                                   const ptrdiff_t bottom_border_stride,
697                                   const int width, const int height,
698                                   SgrBuffer* const sgr_buffer, Pixel* dst) {
699   const auto temp_stride = Align<ptrdiff_t>(width, 8);
700   const ptrdiff_t sum_stride = temp_stride + 8;
701   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
702   const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
703   const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
704   uint16_t *sum5[5], *ma565[2];
705   uint32_t *square_sum5[5], *b565[2];
706   sum5[0] = sgr_buffer->sum5;
707   square_sum5[0] = sgr_buffer->square_sum5;
708   for (int i = 1; i <= 4; ++i) {
709     sum5[i] = sum5[i - 1] + sum_stride;
710     square_sum5[i] = square_sum5[i - 1] + sum_stride;
711   }
712   ma565[0] = sgr_buffer->ma565;
713   ma565[1] = ma565[0] + temp_stride;
714   b565[0] = sgr_buffer->b565;
715   b565[1] = b565[0] + temp_stride;
716   assert(scale != 0);
717   BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1,
718                    square_sum5 + 1);
719   sum5[0] = sum5[1];
720   square_sum5[0] = square_sum5[1];
721   BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
722   const Pixel* const s = (height > 1) ? src + stride : bottom_border;
723   BoxSum<Pixel, 5>(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
724   BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
725                                  ma565[0], b565[0]);
726   sum5[0] = sgr_buffer->sum5;
727   square_sum5[0] = sgr_buffer->square_sum5;
728 
729   for (int y = (height >> 1) - 1; y > 0; --y) {
730     Circulate5PointersBy2<uint16_t>(sum5);
731     Circulate5PointersBy2<uint32_t>(square_sum5);
732     BoxSum<Pixel, 5>(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
733                      square_sum5 + 3);
734     BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
735                                     scale, w0, sgr_buffer, ma565, b565, dst);
736     src += 2 * stride;
737     dst += 2 * stride;
738     std::swap(ma565[0], ma565[1]);
739     std::swap(b565[0], b565[1]);
740   }
741 
742   Circulate5PointersBy2<uint16_t>(sum5);
743   Circulate5PointersBy2<uint32_t>(square_sum5);
744   if ((height & 1) == 0 || height > 1) {
745     const Pixel* sr;
746     ptrdiff_t s_stride;
747     if ((height & 1) == 0) {
748       sr = bottom_border;
749       s_stride = bottom_border_stride;
750     } else {
751       sr = src + 2 * stride;
752       s_stride = bottom_border - (src + 2 * stride);
753     }
754     BoxSum<Pixel, 5>(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
755     BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
756                                     scale, w0, sgr_buffer, ma565, b565, dst);
757   }
758   if ((height & 1) != 0) {
759     src += 3;
760     if (height > 1) {
761       src += 2 * stride;
762       dst += 2 * stride;
763       std::swap(ma565[0], ma565[1]);
764       std::swap(b565[0], b565[1]);
765       Circulate5PointersBy2<uint16_t>(sum5);
766       Circulate5PointersBy2<uint32_t>(square_sum5);
767     }
768     BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride,
769                      1, width + 2, sum5 + 3, square_sum5 + 3);
770     sum5[4] = sum5[3];
771     square_sum5[4] = square_sum5[3];
772     BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
773                                    ma565[1], b565[1]);
774     int x = 0;
775     do {
776       const int p = CalculateFilteredOutput<Pixel>(
777           src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
778       dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
779     } while (++x != width);
780   }
781 }
782 
783 template <int bitdepth, typename Pixel>
BoxFilterProcessPass2(const RestorationUnitInfo & restoration_info,const Pixel * src,const ptrdiff_t stride,const Pixel * const top_border,const ptrdiff_t top_border_stride,const Pixel * bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,SgrBuffer * const sgr_buffer,Pixel * dst)784 inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
785                                   const Pixel* src, const ptrdiff_t stride,
786                                   const Pixel* const top_border,
787                                   const ptrdiff_t top_border_stride,
788                                   const Pixel* bottom_border,
789                                   const ptrdiff_t bottom_border_stride,
790                                   const int width, const int height,
791                                   SgrBuffer* const sgr_buffer, Pixel* dst) {
792   assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
793   const auto temp_stride = Align<ptrdiff_t>(width, 8);
794   const ptrdiff_t sum_stride = temp_stride + 8;
795   const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
796   const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
797   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
798   const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
799   uint16_t *sum3[3], *ma343[3], *ma444[2];
800   uint32_t *square_sum3[3], *b343[3], *b444[2];
801   sum3[0] = sgr_buffer->sum3;
802   square_sum3[0] = sgr_buffer->square_sum3;
803   ma343[0] = sgr_buffer->ma343;
804   b343[0] = sgr_buffer->b343;
805   for (int i = 1; i <= 2; ++i) {
806     sum3[i] = sum3[i - 1] + sum_stride;
807     square_sum3[i] = square_sum3[i - 1] + sum_stride;
808     ma343[i] = ma343[i - 1] + temp_stride;
809     b343[i] = b343[i - 1] + temp_stride;
810   }
811   ma444[0] = sgr_buffer->ma444;
812   ma444[1] = ma444[0] + temp_stride;
813   b444[0] = sgr_buffer->b444;
814   b444[1] = b444[0] + temp_stride;
815   assert(scale != 0);
816   BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3,
817                    square_sum3);
818   BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
819   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
820                                  sgr_buffer, ma343[0], b343[0], nullptr,
821                                  nullptr);
822   Circulate3PointersBy1<uint16_t>(sum3);
823   Circulate3PointersBy1<uint32_t>(square_sum3);
824   const Pixel* s;
825   if (height > 1) {
826     s = src + stride;
827   } else {
828     s = bottom_border;
829     bottom_border += bottom_border_stride;
830   }
831   BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
832   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
833                                  sgr_buffer, ma343[1], b343[1], ma444[0],
834                                  b444[0]);
835 
836   for (int y = height - 2; y > 0; --y) {
837     Circulate3PointersBy1<uint16_t>(sum3);
838     Circulate3PointersBy1<uint32_t>(square_sum3);
839     BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
840                                     sum3, square_sum3, sgr_buffer, ma343, ma444,
841                                     b343, b444, dst);
842     src += stride;
843     dst += stride;
844     Circulate3PointersBy1<uint16_t>(ma343);
845     Circulate3PointersBy1<uint32_t>(b343);
846     std::swap(ma444[0], ma444[1]);
847     std::swap(b444[0], b444[1]);
848   }
849 
850   src += 2;
851   int y = std::min(height, 2);
852   do {
853     Circulate3PointersBy1<uint16_t>(sum3);
854     Circulate3PointersBy1<uint32_t>(square_sum3);
855     BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
856                                     square_sum3, sgr_buffer, ma343, ma444, b343,
857                                     b444, dst);
858     src += stride;
859     dst += stride;
860     bottom_border += bottom_border_stride;
861     Circulate3PointersBy1<uint16_t>(ma343);
862     Circulate3PointersBy1<uint32_t>(b343);
863     std::swap(ma444[0], ma444[1]);
864     std::swap(b444[0], b444[1]);
865   } while (--y != 0);
866 }
867 
868 template <int bitdepth, typename Pixel>
SelfGuidedFilter_C(const RestorationUnitInfo & restoration_info,const void * const source,const ptrdiff_t stride,const void * const top_border,const ptrdiff_t top_border_stride,const void * const bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,RestorationBuffer * const restoration_buffer,void * const dest)869 void SelfGuidedFilter_C(
870     const RestorationUnitInfo& restoration_info, const void* const source,
871     const ptrdiff_t stride, const void* const top_border,
872     const ptrdiff_t top_border_stride, const void* const bottom_border,
873     const ptrdiff_t bottom_border_stride, const int width, const int height,
874     RestorationBuffer* const restoration_buffer, void* const dest) {
875   const int index = restoration_info.sgr_proj_info.index;
876   const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
877   const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
878   const auto* src = static_cast<const Pixel*>(source);
879   const auto* top = static_cast<const Pixel*>(top_border);
880   const auto* bottom = static_cast<const Pixel*>(bottom_border);
881   auto* dst = static_cast<Pixel*>(dest);
882   SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
883   if (radius_pass_1 == 0) {
884     // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
885     // following assertion.
886     assert(radius_pass_0 != 0);
887     BoxFilterProcessPass1<bitdepth, Pixel>(
888         restoration_info, src - 3, stride, top - 3, top_border_stride,
889         bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
890   } else if (radius_pass_0 == 0) {
891     BoxFilterProcessPass2<bitdepth, Pixel>(
892         restoration_info, src - 2, stride, top - 2, top_border_stride,
893         bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst);
894   } else {
895     BoxFilterProcess<bitdepth, Pixel>(
896         restoration_info, src - 3, stride, top - 3, top_border_stride,
897         bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
898   }
899 }
900 
Init8bpp()901 void Init8bpp() {
902   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
903   assert(dsp != nullptr);
904 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
905   dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
906   dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
907 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
908   static_cast<void>(dsp);
909 #ifndef LIBGAV1_Dsp8bpp_WienerFilter
910   dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
911 #endif
912 #ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
913   dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
914 #endif
915 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
916 }
917 
918 #if LIBGAV1_MAX_BITDEPTH >= 10
919 
Init10bpp()920 void Init10bpp() {
921   Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
922   assert(dsp != nullptr);
923 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
924   dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
925   dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
926 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
927   static_cast<void>(dsp);
928 #ifndef LIBGAV1_Dsp10bpp_WienerFilter
929   dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
930 #endif
931 #ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
932   dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
933 #endif
934 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
935 }
936 
937 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
938 }  // namespace
939 
LoopRestorationInit_C()940 void LoopRestorationInit_C() {
941   Init8bpp();
942 #if LIBGAV1_MAX_BITDEPTH >= 10
943   Init10bpp();
944 #endif
945 }
946 
947 }  // namespace dsp
948 }  // namespace libgav1
949