1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/convolve.h"
16 
17 #include <cassert>
18 #include <cstddef>
19 #include <cstdint>
20 #include <cstdlib>
21 #include <cstring>
22 
23 #include "src/dsp/constants.h"
24 #include "src/dsp/dsp.h"
25 #include "src/utils/common.h"
26 #include "src/utils/constants.h"
27 
28 namespace libgav1 {
29 namespace dsp {
30 namespace {
31 
// Number of samples by which the non-scaled convolve functions in this file
// move the source pointer to the left before running a horizontal filter pass.
32 constexpr int kHorizontalOffset = 3;
// Number of rows (multiplied by the source stride) by which the non-scaled
// convolve functions in this file move the source pointer up before running a
// vertical filter pass.
33 constexpr int kVerticalOffset = 3;
34 
35 // Compound prediction output ranges from ConvolveTest.ShowRange.
36 // Bitdepth:  8 Input range:            [       0,      255]
37 //   intermediate range:                [   -7140,    23460]
38 //   first pass output range:           [   -1785,     5865]
39 //   intermediate range:                [ -328440,   589560]
40 //   second pass output range:          [       0,      255]
41 //   compound second pass output range: [   -5132,     9212]
42 //
43 // Bitdepth: 10 Input range:            [       0,     1023]
44 //   intermediate range:                [  -28644,    94116]
45 //   first pass output range:           [   -7161,    23529]
46 //   intermediate range:                [-1317624,  2365176]
47 //   second pass output range:          [       0,     1023]
48 //   compound second pass output range: [    3988,    61532]
49 //
50 // Bitdepth: 12 Input range:            [       0,     4095]
51 //   intermediate range:                [ -114660,   376740]
52 //   first pass output range:           [   -7166,    23546]
53 //   intermediate range:                [-1318560,  2366880]
54 //   second pass output range:          [       0,     4095]
55 //   compound second pass output range: [    3974,    61559]
56 
57 template <int bitdepth, typename Pixel>
ConvolveScale2D_C(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int subpixel_x,const int subpixel_y,const int step_x,const int step_y,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)58 void ConvolveScale2D_C(const void* const reference,
59                        const ptrdiff_t reference_stride,
60                        const int horizontal_filter_index,
61                        const int vertical_filter_index, const int subpixel_x,
62                        const int subpixel_y, const int step_x, const int step_y,
63                        const int width, const int height, void* prediction,
64                        const ptrdiff_t pred_stride) {
65   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
66                                            ? kInterRoundBitsHorizontal12bpp
67                                            : kInterRoundBitsHorizontal;
68   constexpr int kRoundBitsVertical =
69       (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
70   const int intermediate_height =
71       (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
72        kScaleSubPixelBits) +
73       kSubPixelTaps;
74   // The output of the horizontal filter, i.e. the intermediate_result, is
75   // guaranteed to fit in int16_t.
76   int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
77                               (2 * kMaxSuperBlockSizeInPixels + 8)];
78   const int intermediate_stride = kMaxSuperBlockSizeInPixels;
79   const int max_pixel_value = (1 << bitdepth) - 1;
80 
81   // Horizontal filter.
82   // Filter types used for width <= 4 are different from those for width > 4.
83   // When width > 4, the valid filter index range is always [0, 3].
84   // When width <= 4, the valid filter index range is always [4, 5].
85   // Similarly for height.
86   int filter_index = GetFilterIndex(horizontal_filter_index, width);
87   int16_t* intermediate = intermediate_result;
88   const auto* src = static_cast<const Pixel*>(reference);
89   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
90   auto* dest = static_cast<Pixel*>(prediction);
91   const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
92   const int ref_x = subpixel_x >> kScaleSubPixelBits;
93   // Note: assume the input src is already aligned to the correct start
94   // position.
95   int y = 0;
96   do {
97     int p = subpixel_x;
98     int x = 0;
99     do {
100       int sum = 0;
101       const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
102       const int filter_id = (p >> 6) & kSubPixelMask;
103       for (int k = 0; k < kSubPixelTaps; ++k) {
104         sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
105       }
106       intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
107       p += step_x;
108     } while (++x < width);
109 
110     src += src_stride;
111     intermediate += intermediate_stride;
112   } while (++y < intermediate_height);
113 
114   // Vertical filter.
115   filter_index = GetFilterIndex(vertical_filter_index, height);
116   intermediate = intermediate_result;
117   int p = subpixel_y & 1023;
118   y = 0;
119   do {
120     const int filter_id = (p >> 6) & kSubPixelMask;
121     int x = 0;
122     do {
123       int sum = 0;
124       for (int k = 0; k < kSubPixelTaps; ++k) {
125         sum +=
126             kHalfSubPixelFilters[filter_index][filter_id][k] *
127             intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
128                          x];
129       }
130       dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
131                       max_pixel_value);
132     } while (++x < width);
133 
134     dest += dest_stride;
135     p += step_y;
136   } while (++y < height);
137 }
138 
139 template <int bitdepth, typename Pixel>
ConvolveCompoundScale2D_C(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int subpixel_x,const int subpixel_y,const int step_x,const int step_y,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)140 void ConvolveCompoundScale2D_C(const void* const reference,
141                                const ptrdiff_t reference_stride,
142                                const int horizontal_filter_index,
143                                const int vertical_filter_index,
144                                const int subpixel_x, const int subpixel_y,
145                                const int step_x, const int step_y,
146                                const int width, const int height,
147                                void* prediction, const ptrdiff_t pred_stride) {
148   // All compound functions output to the predictor buffer with |pred_stride|
149   // equal to |width|.
150   assert(pred_stride == width);
151   // Compound functions start at 4x4.
152   assert(width >= 4 && height >= 4);
153   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
154                                            ? kInterRoundBitsHorizontal12bpp
155                                            : kInterRoundBitsHorizontal;
156   constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
157   const int intermediate_height =
158       (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
159        kScaleSubPixelBits) +
160       kSubPixelTaps;
161   // The output of the horizontal filter, i.e. the intermediate_result, is
162   // guaranteed to fit in int16_t.
163   int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
164                               (2 * kMaxSuperBlockSizeInPixels + 8)];
165   const int intermediate_stride = kMaxSuperBlockSizeInPixels;
166 
167   // Horizontal filter.
168   // Filter types used for width <= 4 are different from those for width > 4.
169   // When width > 4, the valid filter index range is always [0, 3].
170   // When width <= 4, the valid filter index range is always [4, 5].
171   // Similarly for height.
172   int filter_index = GetFilterIndex(horizontal_filter_index, width);
173   int16_t* intermediate = intermediate_result;
174   const auto* src = static_cast<const Pixel*>(reference);
175   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
176   auto* dest = static_cast<uint16_t*>(prediction);
177   const int ref_x = subpixel_x >> kScaleSubPixelBits;
178   // Note: assume the input src is already aligned to the correct start
179   // position.
180   int y = 0;
181   do {
182     int p = subpixel_x;
183     int x = 0;
184     do {
185       int sum = 0;
186       const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
187       const int filter_id = (p >> 6) & kSubPixelMask;
188       for (int k = 0; k < kSubPixelTaps; ++k) {
189         sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
190       }
191       intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
192       p += step_x;
193     } while (++x < width);
194 
195     src += src_stride;
196     intermediate += intermediate_stride;
197   } while (++y < intermediate_height);
198 
199   // Vertical filter.
200   filter_index = GetFilterIndex(vertical_filter_index, height);
201   intermediate = intermediate_result;
202   int p = subpixel_y & 1023;
203   y = 0;
204   do {
205     const int filter_id = (p >> 6) & kSubPixelMask;
206     int x = 0;
207     do {
208       int sum = 0;
209       for (int k = 0; k < kSubPixelTaps; ++k) {
210         sum +=
211             kHalfSubPixelFilters[filter_index][filter_id][k] *
212             intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
213                          x];
214       }
215       sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
216       sum += (bitdepth == 8) ? 0 : kCompoundOffset;
217       dest[x] = sum;
218     } while (++x < width);
219 
220     dest += pred_stride;
221     p += step_y;
222   } while (++y < height);
223 }
224 
225 template <int bitdepth, typename Pixel>
ConvolveCompound2D_C(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int horizontal_filter_id,const int vertical_filter_id,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)226 void ConvolveCompound2D_C(const void* const reference,
227                           const ptrdiff_t reference_stride,
228                           const int horizontal_filter_index,
229                           const int vertical_filter_index,
230                           const int horizontal_filter_id,
231                           const int vertical_filter_id, const int width,
232                           const int height, void* prediction,
233                           const ptrdiff_t pred_stride) {
234   // All compound functions output to the predictor buffer with |pred_stride|
235   // equal to |width|.
236   assert(pred_stride == width);
237   // Compound functions start at 4x4.
238   assert(width >= 4 && height >= 4);
239   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
240                                            ? kInterRoundBitsHorizontal12bpp
241                                            : kInterRoundBitsHorizontal;
242   constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
243   const int intermediate_height = height + kSubPixelTaps - 1;
244   // The output of the horizontal filter, i.e. the intermediate_result, is
245   // guaranteed to fit in int16_t.
246   int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
247                               (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
248   const int intermediate_stride = kMaxSuperBlockSizeInPixels;
249 
250   // Horizontal filter.
251   // Filter types used for width <= 4 are different from those for width > 4.
252   // When width > 4, the valid filter index range is always [0, 3].
253   // When width <= 4, the valid filter index range is always [4, 5].
254   // Similarly for height.
255   int filter_index = GetFilterIndex(horizontal_filter_index, width);
256   int16_t* intermediate = intermediate_result;
257   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
258   const auto* src = static_cast<const Pixel*>(reference) -
259                     kVerticalOffset * src_stride - kHorizontalOffset;
260   auto* dest = static_cast<uint16_t*>(prediction);
261 
262   // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
263   assert(horizontal_filter_id != 0);
264   int y = 0;
265   do {
266     int x = 0;
267     do {
268       int sum = 0;
269       for (int k = 0; k < kSubPixelTaps; ++k) {
270         sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
271                src[x + k];
272       }
273       intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
274     } while (++x < width);
275 
276     src += src_stride;
277     intermediate += intermediate_stride;
278   } while (++y < intermediate_height);
279 
280   // Vertical filter.
281   filter_index = GetFilterIndex(vertical_filter_index, height);
282   intermediate = intermediate_result;
283   // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
284   assert(vertical_filter_id != 0);
285   y = 0;
286   do {
287     int x = 0;
288     do {
289       int sum = 0;
290       for (int k = 0; k < kSubPixelTaps; ++k) {
291         sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
292                intermediate[k * intermediate_stride + x];
293       }
294       sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
295       sum += (bitdepth == 8) ? 0 : kCompoundOffset;
296       dest[x] = sum;
297     } while (++x < width);
298 
299     dest += pred_stride;
300     intermediate += intermediate_stride;
301   } while (++y < height);
302 }
303 
304 // This function is a simplified version of ConvolveCompound2D_C.
305 // It is called when it is single prediction mode, where both horizontal and
306 // vertical filtering are required.
307 // The output is the single prediction of the block, clipped to valid pixel
308 // range.
309 template <int bitdepth, typename Pixel>
Convolve2D_C(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int horizontal_filter_id,const int vertical_filter_id,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)310 void Convolve2D_C(const void* const reference, const ptrdiff_t reference_stride,
311                   const int horizontal_filter_index,
312                   const int vertical_filter_index,
313                   const int horizontal_filter_id, const int vertical_filter_id,
314                   const int width, const int height, void* prediction,
315                   const ptrdiff_t pred_stride) {
316   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
317                                            ? kInterRoundBitsHorizontal12bpp
318                                            : kInterRoundBitsHorizontal;
319   constexpr int kRoundBitsVertical =
320       (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
321   const int intermediate_height = height + kSubPixelTaps - 1;
322   // The output of the horizontal filter, i.e. the intermediate_result, is
323   // guaranteed to fit in int16_t.
324   int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
325                               (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
326   const int intermediate_stride = kMaxSuperBlockSizeInPixels;
327   const int max_pixel_value = (1 << bitdepth) - 1;
328 
329   // Horizontal filter.
330   // Filter types used for width <= 4 are different from those for width > 4.
331   // When width > 4, the valid filter index range is always [0, 3].
332   // When width <= 4, the valid filter index range is always [4, 5].
333   // Similarly for height.
334   int filter_index = GetFilterIndex(horizontal_filter_index, width);
335   int16_t* intermediate = intermediate_result;
336   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
337   const auto* src = static_cast<const Pixel*>(reference) -
338                     kVerticalOffset * src_stride - kHorizontalOffset;
339   auto* dest = static_cast<Pixel*>(prediction);
340   const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
341   // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
342   assert(horizontal_filter_id != 0);
343   int y = 0;
344   do {
345     int x = 0;
346     do {
347       int sum = 0;
348       for (int k = 0; k < kSubPixelTaps; ++k) {
349         sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
350                src[x + k];
351       }
352       intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
353     } while (++x < width);
354 
355     src += src_stride;
356     intermediate += intermediate_stride;
357   } while (++y < intermediate_height);
358 
359   // Vertical filter.
360   filter_index = GetFilterIndex(vertical_filter_index, height);
361   intermediate = intermediate_result;
362   // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
363   assert(vertical_filter_id != 0);
364   y = 0;
365   do {
366     int x = 0;
367     do {
368       int sum = 0;
369       for (int k = 0; k < kSubPixelTaps; ++k) {
370         sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
371                intermediate[k * intermediate_stride + x];
372       }
373       dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
374                       max_pixel_value);
375     } while (++x < width);
376 
377     dest += dest_stride;
378     intermediate += intermediate_stride;
379   } while (++y < height);
380 }
381 
382 // This function is a simplified version of Convolve2D_C.
383 // It is called when it is single prediction mode, where only horizontal
384 // filtering is required.
385 // The output is the single prediction of the block, clipped to valid pixel
386 // range.
387 template <int bitdepth, typename Pixel>
ConvolveHorizontal_C(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)388 void ConvolveHorizontal_C(const void* const reference,
389                           const ptrdiff_t reference_stride,
390                           const int horizontal_filter_index,
391                           const int /*vertical_filter_index*/,
392                           const int horizontal_filter_id,
393                           const int /*vertical_filter_id*/, const int width,
394                           const int height, void* prediction,
395                           const ptrdiff_t pred_stride) {
396   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
397                                            ? kInterRoundBitsHorizontal12bpp
398                                            : kInterRoundBitsHorizontal;
399   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
400   const int bits = kFilterBits - kRoundBitsHorizontal;
401   const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
402   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
403   auto* dest = static_cast<Pixel*>(prediction);
404   const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
405   const int max_pixel_value = (1 << bitdepth) - 1;
406   int y = 0;
407   do {
408     int x = 0;
409     do {
410       int sum = 0;
411       for (int k = 0; k < kSubPixelTaps; ++k) {
412         sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
413                src[x + k];
414       }
415       sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
416       dest[x] = Clip3(RightShiftWithRounding(sum, bits), 0, max_pixel_value);
417     } while (++x < width);
418 
419     src += src_stride;
420     dest += dest_stride;
421   } while (++y < height);
422 }
423 
424 // This function is a simplified version of Convolve2D_C.
425 // It is called when it is single prediction mode, where only vertical
426 // filtering is required.
427 // The output is the single prediction of the block, clipped to valid pixel
428 // range.
429 template <int bitdepth, typename Pixel>
ConvolveVertical_C(const void * const reference,const ptrdiff_t reference_stride,const int,const int vertical_filter_index,const int,const int vertical_filter_id,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)430 void ConvolveVertical_C(const void* const reference,
431                         const ptrdiff_t reference_stride,
432                         const int /*horizontal_filter_index*/,
433                         const int vertical_filter_index,
434                         const int /*horizontal_filter_id*/,
435                         const int vertical_filter_id, const int width,
436                         const int height, void* prediction,
437                         const ptrdiff_t pred_stride) {
438   const int filter_index = GetFilterIndex(vertical_filter_index, height);
439   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
440   const auto* src =
441       static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
442   auto* dest = static_cast<Pixel*>(prediction);
443   const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
444   // Copy filters must call ConvolveCopy().
445   assert(vertical_filter_id != 0);
446 
447   const int max_pixel_value = (1 << bitdepth) - 1;
448   int y = 0;
449   do {
450     int x = 0;
451     do {
452       int sum = 0;
453       for (int k = 0; k < kSubPixelTaps; ++k) {
454         sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
455                src[k * src_stride + x];
456       }
457       dest[x] = Clip3(RightShiftWithRounding(sum, kFilterBits - 1), 0,
458                       max_pixel_value);
459     } while (++x < width);
460 
461     src += src_stride;
462     dest += dest_stride;
463   } while (++y < height);
464 }
465 
// Copies a block of |width| x |height| Pixel values from |reference| to
// |prediction|, one row at a time, without any filtering.
//
// Both pointers are advanced as uint8_t pointers, so |reference_stride| and
// |pred_stride| are byte strides. The filter arguments are ignored. The loop
// is a do/while, so one row is copied even if |height| is not positive.
template <int bitdepth, typename Pixel>
void ConvolveCopy_C(const void* const reference,
                    const ptrdiff_t reference_stride,
                    const int /*horizontal_filter_index*/,
                    const int /*vertical_filter_index*/,
                    const int /*horizontal_filter_id*/,
                    const int /*vertical_filter_id*/, const int width,
                    const int height, void* prediction,
                    const ptrdiff_t pred_stride) {
  const size_t row_bytes = width * sizeof(Pixel);
  const auto* src_row = static_cast<const uint8_t*>(reference);
  auto* dest_row = static_cast<uint8_t*>(prediction);
  int row = 0;
  do {
    memcpy(dest_row, src_row, row_bytes);
    src_row += reference_stride;
    dest_row += pred_stride;
  } while (++row < height);
}
484 
485 template <int bitdepth, typename Pixel>
ConvolveCompoundCopy_C(const void * const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)486 void ConvolveCompoundCopy_C(const void* const reference,
487                             const ptrdiff_t reference_stride,
488                             const int /*horizontal_filter_index*/,
489                             const int /*vertical_filter_index*/,
490                             const int /*horizontal_filter_id*/,
491                             const int /*vertical_filter_id*/, const int width,
492                             const int height, void* prediction,
493                             const ptrdiff_t pred_stride) {
494   // All compound functions output to the predictor buffer with |pred_stride|
495   // equal to |width|.
496   assert(pred_stride == width);
497   // Compound functions start at 4x4.
498   assert(width >= 4 && height >= 4);
499   constexpr int kRoundBitsVertical =
500       ((bitdepth == 12) ? kInterRoundBitsVertical12bpp
501                         : kInterRoundBitsVertical) -
502       kInterRoundBitsCompoundVertical;
503   const auto* src = static_cast<const Pixel*>(reference);
504   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
505   auto* dest = static_cast<uint16_t*>(prediction);
506   int y = 0;
507   do {
508     int x = 0;
509     do {
510       int sum = (bitdepth == 8) ? 0 : ((1 << bitdepth) + (1 << (bitdepth - 1)));
511       sum += src[x];
512       dest[x] = sum << kRoundBitsVertical;
513     } while (++x < width);
514     src += src_stride;
515     dest += pred_stride;
516   } while (++y < height);
517 }
518 
519 // This function is a simplified version of ConvolveCompound2D_C.
520 // It is called when it is compound prediction mode, where only horizontal
521 // filtering is required.
522 // The output is not clipped to valid pixel range. Its output will be
523 // blended with another predictor to generate the final prediction of the block.
524 template <int bitdepth, typename Pixel>
ConvolveCompoundHorizontal_C(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)525 void ConvolveCompoundHorizontal_C(
526     const void* const reference, const ptrdiff_t reference_stride,
527     const int horizontal_filter_index, const int /*vertical_filter_index*/,
528     const int horizontal_filter_id, const int /*vertical_filter_id*/,
529     const int width, const int height, void* prediction,
530     const ptrdiff_t pred_stride) {
531   // All compound functions output to the predictor buffer with |pred_stride|
532   // equal to |width|.
533   assert(pred_stride == width);
534   // Compound functions start at 4x4.
535   assert(width >= 4 && height >= 4);
536   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
537                                            ? kInterRoundBitsHorizontal12bpp
538                                            : kInterRoundBitsHorizontal;
539   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
540   const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
541   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
542   auto* dest = static_cast<uint16_t*>(prediction);
543   // Copy filters must call ConvolveCopy().
544   assert(horizontal_filter_id != 0);
545   int y = 0;
546   do {
547     int x = 0;
548     do {
549       int sum = 0;
550       for (int k = 0; k < kSubPixelTaps; ++k) {
551         sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
552                src[x + k];
553       }
554       sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
555       sum += (bitdepth == 8) ? 0 : kCompoundOffset;
556       dest[x] = sum;
557     } while (++x < width);
558 
559     src += src_stride;
560     dest += pred_stride;
561   } while (++y < height);
562 }
563 
564 // This function is a simplified version of ConvolveCompound2D_C.
565 // It is called when it is compound prediction mode, where only vertical
566 // filtering is required.
567 // The output is not clipped to valid pixel range. Its output will be
568 // blended with another predictor to generate the final prediction of the block.
569 template <int bitdepth, typename Pixel>
ConvolveCompoundVertical_C(const void * const reference,const ptrdiff_t reference_stride,const int,const int vertical_filter_index,const int,const int vertical_filter_id,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)570 void ConvolveCompoundVertical_C(const void* const reference,
571                                 const ptrdiff_t reference_stride,
572                                 const int /*horizontal_filter_index*/,
573                                 const int vertical_filter_index,
574                                 const int /*horizontal_filter_id*/,
575                                 const int vertical_filter_id, const int width,
576                                 const int height, void* prediction,
577                                 const ptrdiff_t pred_stride) {
578   // All compound functions output to the predictor buffer with |pred_stride|
579   // equal to |width|.
580   assert(pred_stride == width);
581   // Compound functions start at 4x4.
582   assert(width >= 4 && height >= 4);
583   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
584                                            ? kInterRoundBitsHorizontal12bpp
585                                            : kInterRoundBitsHorizontal;
586   const int filter_index = GetFilterIndex(vertical_filter_index, height);
587   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
588   const auto* src =
589       static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
590   auto* dest = static_cast<uint16_t*>(prediction);
591   // Copy filters must call ConvolveCopy().
592   assert(vertical_filter_id != 0);
593   int y = 0;
594   do {
595     int x = 0;
596     do {
597       int sum = 0;
598       for (int k = 0; k < kSubPixelTaps; ++k) {
599         sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
600                src[k * src_stride + x];
601       }
602       sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
603       sum += (bitdepth == 8) ? 0 : kCompoundOffset;
604       dest[x] = sum;
605     } while (++x < width);
606     src += src_stride;
607     dest += pred_stride;
608   } while (++y < height);
609 }
610 
611 // This function is used when intra block copy is present.
612 // It is called when it is single prediction mode for U/V plane, where the
613 // reference block is from current frame and both horizontal and vertical
614 // filtering are required.
615 // The output is the single prediction of the block, clipped to valid pixel
616 // range.
617 template <int bitdepth, typename Pixel>
ConvolveIntraBlockCopy2D_C(const void * const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)618 void ConvolveIntraBlockCopy2D_C(const void* const reference,
619                                 const ptrdiff_t reference_stride,
620                                 const int /*horizontal_filter_index*/,
621                                 const int /*vertical_filter_index*/,
622                                 const int /*horizontal_filter_id*/,
623                                 const int /*vertical_filter_id*/,
624                                 const int width, const int height,
625                                 void* prediction, const ptrdiff_t pred_stride) {
626   assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
627   assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
628   const auto* src = static_cast<const Pixel*>(reference);
629   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
630   auto* dest = static_cast<Pixel*>(prediction);
631   const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
632   const int intermediate_height = height + 1;
633   uint16_t intermediate_result[kMaxSuperBlockSizeInPixels *
634                                (kMaxSuperBlockSizeInPixels + 1)];
635   uint16_t* intermediate = intermediate_result;
636   // Note: allow vertical access to height + 1. Because this function is only
637   // for u/v plane of intra block copy, such access is guaranteed to be within
638   // the prediction block.
639   int y = 0;
640   do {
641     int x = 0;
642     do {
643       intermediate[x] = src[x] + src[x + 1];
644     } while (++x < width);
645 
646     src += src_stride;
647     intermediate += width;
648   } while (++y < intermediate_height);
649 
650   intermediate = intermediate_result;
651   y = 0;
652   do {
653     int x = 0;
654     do {
655       dest[x] =
656           RightShiftWithRounding(intermediate[x] + intermediate[x + width], 2);
657     } while (++x < width);
658 
659     intermediate += width;
660     dest += dest_stride;
661   } while (++y < height);
662 }
663 
664 // This function is used when intra block copy is present.
665 // It is called when it is single prediction mode for U/V plane, where the
666 // reference block is from the current frame and only horizontal or vertical
667 // filtering is required.
668 // The output is the single prediction of the block, clipped to valid pixel
669 // range.
670 // The filtering of intra block copy is simply the average of current and
671 // the next pixel.
672 template <int bitdepth, typename Pixel, bool is_horizontal>
ConvolveIntraBlockCopy1D_C(const void * const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)673 void ConvolveIntraBlockCopy1D_C(const void* const reference,
674                                 const ptrdiff_t reference_stride,
675                                 const int /*horizontal_filter_index*/,
676                                 const int /*vertical_filter_index*/,
677                                 const int /*horizontal_filter_id*/,
678                                 const int /*vertical_filter_id*/,
679                                 const int width, const int height,
680                                 void* prediction, const ptrdiff_t pred_stride) {
681   assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
682   assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
683   const auto* src = static_cast<const Pixel*>(reference);
684   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
685   auto* dest = static_cast<Pixel*>(prediction);
686   const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
687   const ptrdiff_t offset = is_horizontal ? 1 : src_stride;
688   int y = 0;
689   do {
690     int x = 0;
691     do {
692       dest[x] = RightShiftWithRounding(src[x] + src[x + offset], 1);
693     } while (++x < width);
694 
695     src += src_stride;
696     dest += dest_stride;
697   } while (++y < height);
698 }
699 
Init8bpp()700 void Init8bpp() {
701   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
702   assert(dsp != nullptr);
703 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
704   dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
705   dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
706   dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
707   dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
708 
709   dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
710   dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
711   dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
712   dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
713 
714   dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
715   dsp->convolve[1][0][0][1] =
716       ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
717   dsp->convolve[1][0][1][0] =
718       ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
719   dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
720 
721   dsp->convolve[1][1][0][0] = nullptr;
722   dsp->convolve[1][1][0][1] = nullptr;
723   dsp->convolve[1][1][1][0] = nullptr;
724   dsp->convolve[1][1][1][1] = nullptr;
725 
726   dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
727   dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
728 #else  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
729 #ifndef LIBGAV1_Dsp8bpp_ConvolveCopy
730   dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
731 #endif
732 #ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
733   dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
734 #endif
735 #ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
736   dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
737 #endif
738 #ifndef LIBGAV1_Dsp8bpp_Convolve2D
739   dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
740 #endif
741 
742 #ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
743   dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
744 #endif
745 #ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
746   dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
747 #endif
748 #ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
749   dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
750 #endif
751 #ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
752   dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
753 #endif
754 
755 #ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy
756   dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
757 #endif
758 #ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal
759   dsp->convolve[1][0][0][1] =
760       ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
761 #endif
762 #ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical
763   dsp->convolve[1][0][1][0] =
764       ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
765 #endif
766 #ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D
767   dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
768 #endif
769 
770   dsp->convolve[1][1][0][0] = nullptr;
771   dsp->convolve[1][1][0][1] = nullptr;
772   dsp->convolve[1][1][1][0] = nullptr;
773   dsp->convolve[1][1][1][1] = nullptr;
774 
775 #ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
776   dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
777 #endif
778 #ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
779   dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
780 #endif
781 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
782 }
783 
#if LIBGAV1_MAX_BITDEPTH >= 10
// Fills the convolve entries of the 10bpp Dsp table with the C implementations
// from this file.
//
// Judging by the functions assigned below, |convolve| is indexed as
// [intra block copy][compound][vertical filter][horizontal filter]. The
// intra block copy + compound combination has no implementation and is always
// set to nullptr.
//
// When LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is nonzero every entry is installed.
// Otherwise an entry is installed only if its LIBGAV1_Dsp10bpp_* macro is not
// defined (the macros are defined outside this file, presumably when an
// optimized replacement exists).
//
// The filtered intra block copy entries accept two guard spellings. The 8bpp
// table uses LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy{Horizontal,Vertical,2D},
// whereas this table was originally guarded by names without "Copy"
// (LIBGAV1_Dsp10bpp_ConvolveIntraBlock{Horizontal,Vertical,2D}). Defining
// either spelling now suppresses the C entry, so a guard written with the 8bpp
// convention is no longer silently ignored and existing definitions of the
// older spelling keep working.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
  assert(dsp != nullptr);

  // Single prediction.
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveCopy)
  dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveHorizontal)
  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveVertical)
  dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || !defined(LIBGAV1_Dsp10bpp_Convolve2D)
  dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
#endif

  // Compound prediction.
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveCompoundCopy)
  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal)
  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveCompoundVertical)
  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveCompound2D)
  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
#endif

  // Intra block copy, single prediction. The unfiltered case reuses the plain
  // copy function.
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy)
  dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||                             \
    (!defined(LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopyHorizontal) && \
     !defined(LIBGAV1_Dsp10bpp_ConvolveIntraBlockHorizontal))
  dsp->convolve[1][0][0][1] =
      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||                           \
    (!defined(LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopyVertical) && \
     !defined(LIBGAV1_Dsp10bpp_ConvolveIntraBlockVertical))
  dsp->convolve[1][0][1][0] =
      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||                     \
    (!defined(LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy2D) && \
     !defined(LIBGAV1_Dsp10bpp_ConvolveIntraBlock2D))
  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
#endif

  // Intra block copy combined with compound prediction: no implementation.
  dsp->convolve[1][1][0][0] = nullptr;
  dsp->convolve[1][1][0][1] = nullptr;
  dsp->convolve[1][1][1][0] = nullptr;
  dsp->convolve[1][1][1][1] = nullptr;

  // Scaled prediction: [0] is single, [1] is compound.
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveScale2D)
  dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D)
  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
#endif
}
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
869 
870 }  // namespace
871 
ConvolveInit_C()872 void ConvolveInit_C() {
873   Init8bpp();
874 #if LIBGAV1_MAX_BITDEPTH >= 10
875   Init10bpp();
876 #endif
877 }
878 
879 }  // namespace dsp
880 }  // namespace libgav1
881